Coverage for /builds/BuildGrid/buildgrid/buildgrid/server/metrics_names.py: 100.00%
90 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-22 21:04 +0000
1# Copyright (C) 2020 Bloomberg LP
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# <http://www.apache.org/licenses/LICENSE-2.0>
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15#
16# CAS metrics
17#
19#: Number of exceptions thrown from CAS servicer functions
20CAS_EXCEPTION_COUNT_METRIC_NAME = 'cas-exception'
22#: Number of bytes uploaded to a CAS instance
23CAS_UPLOADED_BYTES_METRIC_NAME = 'cas-uploaded-bytes'
25#: Number of bytes downloaded from a CAS instance
26CAS_DOWNLOADED_BYTES_METRIC_NAME = 'cas-downloaded-bytes'
28#: Number of blobs requested in ``FindMissingBlobs()`` calls
29CAS_FIND_MISSING_BLOBS_NUM_REQUESTED_METRIC_NAME = 'find-missing-blobs-num-requested'
31#: Size of blobs requested in ``FindMissingBlobs()`` calls
32CAS_FIND_MISSING_BLOBS_SIZE_BYTES_REQUESTED_METRIC_NAME = 'find-missing-blobs-size-bytes-requested'
34#: Number of blobs reported to be missing in ``FindMissingBlobs()`` calls
35CAS_FIND_MISSING_BLOBS_NUM_MISSING_METRIC_NAME = 'find-missing-blobs-num-missing'
37#: Percentage of blobs reported to be missing in ``FindMissingBlobs()`` calls
38CAS_FIND_MISSING_BLOBS_PERCENT_MISSING_METRIC_NAME = 'find-missing-blobs-percent-missing'
40#: Size of blobs reported to be missing in ``FindMissingBlobs()`` calls
41CAS_FIND_MISSING_BLOBS_SIZE_BYTES_MISSING_METRIC_NAME = 'find-missing-blobs-size-bytes-missing'
43#: Time that ``FindMissingBlobs()`` operations took to complete
44CAS_FIND_MISSING_BLOBS_TIME_METRIC_NAME = 'find-missing-blobs'
46#: Time that ``BatchUpdateBlobs()`` operations took to complete
47CAS_BATCH_UPDATE_BLOBS_TIME_METRIC_NAME = 'batch-update-blobs'
49#: Size of blobs written with ``BatchUpdateBlobs()`` calls
50CAS_BATCH_UPDATE_BLOBS_SIZE_BYTES = 'batch-update-blobs-size-bytes'
52#: Time that ``BatchReadBlobs()`` operations took to complete
53CAS_BATCH_READ_BLOBS_TIME_METRIC_NAME = 'batch-read-blobs'
55#: Size of blobs read with ``BatchReadBlobs()`` calls
56CAS_BATCH_READ_BLOBS_SIZE_BYTES = 'batch-read-blobs-size-bytes'
58#: Time that ``GetTree()`` operations took to complete
59CAS_GET_TREE_TIME_METRIC_NAME = 'get-tree'
61#: Time that ``ByteStream.Read()`` operations took to complete
62CAS_BYTESTREAM_READ_TIME_METRIC_NAME = 'bytestream-read'
64#: Size of blobs read with ``ByteStream.Read()``
65CAS_BYTESTREAM_READ_SIZE_BYTES = 'bytestream-read-size-bytes'
67#: Time that ``ByteStream.Write()`` operations took to complete
68CAS_BYTESTREAM_WRITE_TIME_METRIC_NAME = 'bytestream-write'
70#: Size of blobs written with ``ByteStream.Write()``
71CAS_BYTESTREAM_WRITE_SIZE_BYTES = 'bytestream-write-size-bytes'
73# CAS cache wrapper metrics
75#: Count of cache misses in BatchReadBlobs requests to the
76# !with-cache-storage. This only counts the blobs which were
77# in the fallback storage; blobs that were entirely missing
78# don't count as cache misses, since this metric is intended
79# to measure how many things that *could* have been cached
80# were actually not.
81CAS_CACHE_BULK_READ_MISS_COUNT_NAME = 'cas-withcache-bulk-read-misses'
83#: Count of cache hits in BatchReadBlobs requests to the !with-cache-storage
84CAS_CACHE_BULK_READ_HIT_COUNT_NAME = 'cas-withcache-bulk-read-hits'
86#: Percentage of cache hits in a given BatchReadBlobs request in the
87# !with-cache-storage. This is as a percentage of total blobs requested,
88# including blobs which were missing entirely.
89CAS_CACHE_BULK_READ_HIT_PERCENTAGE_NAME = 'cas-withcache-bulk-read-hit-percent'
91#: Count of cache misses in ByteStream Read requests to the
92# !with-cache-storage. This only counts the blobs which were
93# in the fallback storage; blobs that were entirely missing
94# don't count as cache misses, since this metric is intended
95# to measure how many things that *could* have been cached
96# were actually not.
97CAS_CACHE_GET_BLOB_MISS_COUNT_NAME = 'cas-withcache-get-blob-misses'
99#: Count of cache hits in ByteStream Read requests to the !with-cache-storage
100CAS_CACHE_GET_BLOB_HIT_COUNT_NAME = 'cas-withcache-get-blob-hits'
102# Indexed CAS metrics
104#: Time taken to bulk select a number of digests from the index
105CAS_INDEX_BULK_SELECT_DIGEST_TIME_METRIC_NAME = 'cas.index.bulk-select-digest-time'
107#: Time taken to update a blob timestamp in the index
108CAS_INDEX_BLOB_TIMESTAMP_UPDATE_TIME_METRIC_NAME = 'cas.index.blob-timestamp-update-time'
110#: Time taken to run a bulk timestamp update in the index
111CAS_INDEX_BULK_TIMESTAMP_UPDATE_TIME_METRIC_NAME = 'cas.index.bulk-timestamp-update-time'
113#: Time taken to return from ``get_blob()``. This includes the time taken to
114# check and update the index, along with the time to fetch the blob from the
115# underlying storage, and update the index if ``fallback_on_get`` is enabled.
116CAS_INDEX_GET_BLOB_TIME_METRIC_NAME = 'cas.index.get-blob-time'
118#: Time taken to store a list of digests in the index
119CAS_INDEX_SAVE_DIGESTS_TIME_METRIC_NAME = 'cas.index.save-digests-time'
121#: Time taken to get the total size of the CAS the index is for
122CAS_INDEX_SIZE_CALCULATION_TIME_METRIC_NAME = 'cas.index.total-size-calculation-time'
124#
125# ActionCache metrics
126#
128#: Time that ``GetActionResult()`` operations took to complete
129AC_GET_ACTION_RESULT_TIME_METRIC_NAME = 'get-action-result'
131#: Time that ``UpdateActionResult()`` operations took to complete
132AC_UPDATE_ACTION_RESULT_TIME_METRIC_NAME = 'update-action-result'
134#: Number of cache hits from the ActionCache
135AC_CACHE_HITS_METRIC_NAME = 'action-cache-hits'
137#: Number of cache misses from the ActionCache
138AC_CACHE_MISSES_METRIC_NAME = 'action-cache-misses'
141#
142# S3 metrics
143#
145#: Time taken to check errors from a bulk_delete
146S3_DELETE_ERROR_CHECK_METRIC_NAME = "s3-deletion-error-check-timer"
149#
150# Cleanup metrics
151#
153#: Number of blobs deleted per second in a cleanup batch
154CLEANUP_BLOBS_DELETION_RATE_METRIC_NAME = "cleanup.blobs-deleted-per-second"
156#: Number of bytes deleted per second in a cleanup batch
157CLEANUP_BYTES_DELETION_RATE_METRIC_NAME = "cleanup.bytes-deleted-per-second"
159#: Total time taken to clean enough blobs to get the CAS size down to the low watermark
160CLEANUP_RUNTIME_METRIC_NAME = "cleanup.runtime-timer"
162#: Time taken to bulk delete a set of blobs from the index
163CLEANUP_INDEX_BULK_DELETE_METRIC_NAME = "cleanup.index.bulk-delete-timer"
165#: Time taken to mark a set of blobs as deleted in the index
166CLEANUP_INDEX_MARK_DELETED_METRIC_NAME = "cleanup.index.mark-as-deleted-timer"
168#: Number of blobs that were already marked for deletion in the index when marking as deleted
169CLEANUP_INDEX_PREMARKED_BLOBS_METRIC_NAME = "cleanup.index.premarked-blobs-count"
171#: Time taken to bulk delete a set of blobs from the storage backend
172CLEANUP_STORAGE_BULK_DELETE_METRIC_NAME = "cleanup.storage.bulk-delete-timer"
174#: Number of blobs that failed to be deleted from the storage backend in a given bulk delete request
175CLEANUP_STORAGE_DELETION_FAILURES_METRIC_NAME = "cleanup.storage.deletion-failures-count"
178#
179# ExecutedActionMetadata metrics
180#
182#: Time spent queued before being assigned to a worker
183QUEUED_TIME_METRIC_NAME = 'action-queued-time'
185#: Time spent in the worker (fetching inputs + executing + uploading outputs)
186WORKER_HANDLING_TIME_METRIC_NAME = 'worker-handling-time'
188#: Time spent fetching inputs before execution
189INPUTS_FETCHING_TIME_METRIC_NAME = 'inputs-fetching-time'
191#: Time spent waiting for executions to complete
192EXECUTION_TIME_METRIC_NAME = 'execution-time'
194#: Time spent uploading outputs after execution
195OUTPUTS_UPLOADING_TIME_METRIC_NAME = 'outputs-uploading-time'
197#: Total time spent servicing an execution request (time queued + fetching inputs +
198# executing + uploading outputs)
199TOTAL_HANDLING_TIME_METRIC_NAME = 'total-handling-time'
202#
203# Execution service metrics
204#
206#: Number of bots connected
207BOT_COUNT_METRIC_NAME = 'bots-count'
209#: Number of clients connected
210CLIENT_COUNT_METRIC_NAME = 'clients-count'
212#: Number of leases present in the scheduler
213LEASE_COUNT_METRIC_NAME = 'lease-count'
215#: Counter metric indicating lease state transitions
216LEASE_CHANGES_COUNTER_METRIC_NAME = 'lease-state-transitions-counter'
218#: Number of active jobs in the scheduler
219JOB_COUNT_METRIC_NAME = 'job-count'
221#: Counter metric indicating job stage transitions
222JOB_CHANGES_COUNTER_METRIC_NAME = 'job-stage-transitions-counter'
224#: Average time that a job spends waiting to be executed
225AVERAGE_QUEUE_TIME_METRIC_NAME = 'average-queue-time'
227#: Number of ``Execute()`` requests received:
228EXECUTE_REQUEST_COUNT_METRIC_NAME = 'execute-call-count'
230#: Time spent servicing ``Execute()`` requests:
231EXECUTE_SERVICER_TIME_METRIC_NAME = 'execute-servicing-time'
233#: Number of ``WaitExecution()`` requests received:
234WAIT_EXECUTION_REQUEST_COUNT_METRIC_NAME = 'wait-execution-call-count'
236#: Time spent servicing ``WaitExecution()`` requests:
237WAIT_EXECUTION_SERVICER_TIME_METRIC_NAME = 'wait-execution-servicing-time'
239#
240# LogStream service metrics
241#
243#: Time spent creating a LogStream
244LOGSTREAM_CREATE_LOG_STREAM_TIME_METRIC_NAME = 'logstream.create-logstream-time'
246#: Number of bytes in a committed logstream
247LOGSTREAM_WRITE_UPLOADED_BYTES_COUNT = 'logstream.write.uploaded-bytes-count'
249#
250# Authentication Metrics
251#
253#: Number of invalid JWTs received:
254INVALID_JWT_COUNT_METRIC_NAME = 'authentication.jwt.invalid-jwt-count'
256#: Duration of JWK fetch request:
257JWK_FETCH_TIME_METRIC_NAME = 'authentication.jwk.fetch-request-time'
259#: Duration of JWT decoding:
260JWT_DECODE_TIME_METRIC_NAME = 'authentication.jwt.decode-jwt-time'
262#: Duration of JWT validation (can include fetching JWK):
263JWT_VALIDATION_TIME_METRIC_NAME = 'authentication.jwt.validate-jwt-time'
265#
266# Bots service metrics
267#
269#: Time spent servicing ``CreateBotSession()`` requests
270BOTS_CREATE_BOT_SESSION_TIME_METRIC_NAME = 'bots.create-bot-session-time'
272#: Time spent servicing ``UpdateBotSession()`` requests
273BOTS_UPDATE_BOT_SESSION_TIME_METRIC_NAME = 'bots.update-bot-session-time'
275#: Time spent selecting an Action from the data store to create a lease for
276BOTS_ASSIGN_JOB_LEASES_TIME_METRIC_NAME = 'bots.assign-job-leases-time'
279#
280# Scheduler metrics
281#
283#: Time taken to queue an Action
284SCHEDULER_QUEUE_ACTION_TIME_METRIC_NAME = 'scheduler.queue-action-time'
286#: Time taken to update a job's Lease
287SCHEDULER_UPDATE_LEASE_TIME_METRIC_NAME = 'scheduler.update-lease-time'
289#: Time taken to cancel an Operation
290SCHEDULER_CANCEL_OPERATION_TIME_METRIC_NAME = 'scheduler.cancel-operation-time'
293#
294# Data Store (scheduler's backend) metrics
295#
296# Some of these seem like duplicates of the request-level timers
297# at a glance, but measuring at the data store level allows us to
298# see how much overhead our own code is adding to the calls.
299#
301#: Time taken to create a Job
302DATA_STORE_CREATE_JOB_TIME_METRIC_NAME = 'datastore.all.create-job-time'
304#: Time taken to enqueue a Job
305DATA_STORE_QUEUE_JOB_TIME_METRIC_NAME = 'datastore.all.queue-job-time'
307#: Time taken to update a Job
308DATA_STORE_UPDATE_JOB_TIME_METRIC_NAME = 'datastore.all.update-job-time'
310#: Time taken to create a Lease
311DATA_STORE_CREATE_LEASE_TIME_METRIC_NAME = 'datastore.all.create-lease-time'
313#: Time taken to update a Lease
314DATA_STORE_UPDATE_LEASE_TIME_METRIC_NAME = 'datastore.all.update-lease-time'
316#: Time taken to create an Operation
317DATA_STORE_CREATE_OPERATION_TIME_METRIC_NAME = 'datastore.all.create-operation-time'
319#: Time taken to update an Operation
320DATA_STORE_UPDATE_OPERATION_TIME_METRIC_NAME = 'datastore.all.update-operation-time'
322#: Time taken to get a list of Operations
323DATA_STORE_LIST_OPERATIONS_TIME_METRIC_NAME = 'datastore.all.list-operations-time'
325#: Time taken to get a Job by Action Digest
326DATA_STORE_GET_JOB_BY_DIGEST_TIME_METRIC_NAME = 'datastore.all.get-job-by-digest-time'
328#: Time taken to get a Job by name
329DATA_STORE_GET_JOB_BY_NAME_TIME_METRIC_NAME = 'datastore.all.get-job-by-name-time'
331#: Time taken to get a Job by Operation name
332DATA_STORE_GET_JOB_BY_OPERATION_TIME_METRIC_NAME = 'datastore.all.get-job-by-operation-time'
334#: Time taken to handle checking for a job update. When using
335# a database backend other than PostgreSQL, this will measure
336# how long it takes to check all watched jobs for updates once.
337# For PostgreSQL and the in-memory scheduler, this measures how
338# long it takes to handle a job update notification.
339DATA_STORE_CHECK_FOR_UPDATE_TIME_METRIC_NAME = 'datastore.all.check-for-update-time'
341# SQL-specific metrics
343#: Time taken to store the ExecuteResponse
344DATA_STORE_STORE_RESPONSE_TIME_METRIC_NAME = 'datastore.sql.store-response-time'
346#: Number of rows deleted from the jobs table during each pruning
347DATA_STORE_PRUNER_NUM_ROWS_DELETED_METRIC_NAME = "datastore.sql.pruner-num-rows-deleted"
349#: Time taken per scheduler pruning invocation
350DATA_STORE_PRUNER_DELETE_TIME_METRIC_NAME = 'datastore.sql.pruner-delete-time'
353#
354# Operations service metrics
355#
357#: Time taken to completely handle a ListOperations request
358OPERATIONS_LIST_OPERATIONS_TIME_METRIC_NAME = 'operations.list-operations-time'
360#: Time taken to completely handle a GetOperation request
361OPERATIONS_GET_OPERATION_TIME_METRIC_NAME = 'operations.get-operation-time'
363#: Time taken to completely handle a CancelOperation request
364OPERATIONS_CANCEL_OPERATION_TIME_METRIC_NAME = 'operations.cancel-operation-time'
366#: Time taken to completely handle a DeleteOperation request. BuildGrid
367# doesn't actually support DeleteOperation, but this metric will at
368# least provide insight into whether people are attempting to call it.
369OPERATIONS_DELETE_OPERATION_TIME_METRIC_NAME = 'operations.delete-operation-time'