Coverage for /builds/BuildGrid/buildgrid/buildgrid/server/metrics_names.py: 100.00%
91 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-05 15:37 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-05 15:37 +0000
1# Copyright (C) 2020 Bloomberg LP
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# <http://www.apache.org/licenses/LICENSE-2.0>
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15#
16# CAS metrics
17#
19#: Number of exceptions thrown from CAS servicer functions
20CAS_EXCEPTION_COUNT_METRIC_NAME = "cas-exception"
22#: Number of bytes uploaded to a CAS instance
23CAS_UPLOADED_BYTES_METRIC_NAME = "cas-uploaded-bytes"
25#: Number of bytes downloaded from a CAS instance
26CAS_DOWNLOADED_BYTES_METRIC_NAME = "cas-downloaded-bytes"
28#: Number of blobs requested in ``FindMissingBlobs()`` calls
29CAS_FIND_MISSING_BLOBS_NUM_REQUESTED_METRIC_NAME = "find-missing-blobs-num-requested"
31#: Size of blobs requested in ``FindMissingBlobs()`` calls
32CAS_FIND_MISSING_BLOBS_SIZE_BYTES_REQUESTED_METRIC_NAME = "find-missing-blobs-size-bytes-requested"
34#: Number of blobs reported to be missing in ``FindMissingBlobs()`` calls
35CAS_FIND_MISSING_BLOBS_NUM_MISSING_METRIC_NAME = "find-missing-blobs-num-missing"
37#: Percentage of blobs reported to be missing in ``FindMissingBlobs()`` calls
38CAS_FIND_MISSING_BLOBS_PERCENT_MISSING_METRIC_NAME = "find-missing-blobs-percent-missing"
40#: Size of blobs reported to be missing in ``FindMissingBlobs()`` calls
41CAS_FIND_MISSING_BLOBS_SIZE_BYTES_MISSING_METRIC_NAME = "find-missing-blobs-size-bytes-missing"
43#: Time that ``FindMissingBlobs()`` operations took to complete
44CAS_FIND_MISSING_BLOBS_TIME_METRIC_NAME = "find-missing-blobs"
46#: Time that ``BatchUpdateBlobs()`` operations took to complete
47CAS_BATCH_UPDATE_BLOBS_TIME_METRIC_NAME = "batch-update-blobs"
49#: Size of blobs written with ``BatchUpdateBlobs()`` calls
50CAS_BATCH_UPDATE_BLOBS_SIZE_BYTES = "batch-update-blobs-size-bytes"
52#: Time that ``BatchReadBlobs()`` operations took to complete
53CAS_BATCH_READ_BLOBS_TIME_METRIC_NAME = "batch-read-blobs"
55#: Size of blobs read with ``BatchReadBlobs()`` calls
56CAS_BATCH_READ_BLOBS_SIZE_BYTES = "batch-read-blobs-size-bytes"
58#: Time that ``GetTree()`` operations took to complete
59CAS_GET_TREE_TIME_METRIC_NAME = "get-tree"
61#: Time that ``ByteStream.Read()`` operations took to complete
62CAS_BYTESTREAM_READ_TIME_METRIC_NAME = "bytestream-read"
64#: Size of blobs read with ``ByteStream.Read()``
65CAS_BYTESTREAM_READ_SIZE_BYTES = "bytestream-read-size-bytes"
67#: Time that ``ByteStream.Write()`` operations took to complete
68CAS_BYTESTREAM_WRITE_TIME_METRIC_NAME = "bytestream-write"
70#: Size of blobs written with ``ByteStream.Write()``
71CAS_BYTESTREAM_WRITE_SIZE_BYTES = "bytestream-write-size-bytes"
73# CAS cache wrapper metrics
75#: Count of cache misses in BatchReadBlobs requests to the
76#: !with-cache-storage. This only counts the blobs which were
77#: in the fallback storage; blobs that were entirely missing
78#: don't count as cache misses, since this metric is intended
79#: to measure how many things that *could* have been cached
80#: were actually not.
81CAS_CACHE_BULK_READ_MISS_COUNT_NAME = "cas-withcache-bulk-read-misses"
83#: Count of cache hits in BatchReadBlobs requests to the !with-cache-storage
84CAS_CACHE_BULK_READ_HIT_COUNT_NAME = "cas-withcache-bulk-read-hits"
86#: Percentage of cache hits in a given BatchReadBlobs request in the
87#: !with-cache-storage. This is as a percentage of total blobs requested,
88#: including blobs which were missing entirely.
89CAS_CACHE_BULK_READ_HIT_PERCENTAGE_NAME = "cas-withcache-bulk-read-hit-percent"
91#: Count of cache misses in ByteStream Read requests to the
92#: !with-cache-storage. This only counts the blobs which were
93#: in the fallback storage; blobs that were entirely missing
94#: don't count as cache misses, since this metric is intended
95#: to measure how many things that *could* have been cached
96#: were actually not.
97CAS_CACHE_GET_BLOB_MISS_COUNT_NAME = "cas-withcache-get-blob-misses"
99#: Count of cache hits in ByteStream Read requests to the !with-cache-storage
100CAS_CACHE_GET_BLOB_HIT_COUNT_NAME = "cas-withcache-get-blob-hits"
102# Indexed CAS metrics
104#: Time taken to bulk select a number of digests from the index
105CAS_INDEX_BULK_SELECT_DIGEST_TIME_METRIC_NAME = "cas.index.bulk-select-digest-time"
107#: Time taken to update a blob timestamp in the index
108CAS_INDEX_BLOB_TIMESTAMP_UPDATE_TIME_METRIC_NAME = "cas.index.blob-timestamp-update-time"
110#: Time taken to run a bulk timestamp update in the index
111CAS_INDEX_BULK_TIMESTAMP_UPDATE_TIME_METRIC_NAME = "cas.index.bulk-timestamp-update-time"
113#: Time taken to return from ``get_blob()``. This includes the time taken to
114#: check and update the index, along with the time to fetch the blob from the
115#: underlying storage, and update the index if ``fallback_on_get`` is enabled.
116CAS_INDEX_GET_BLOB_TIME_METRIC_NAME = "cas.index.get-blob-time"
118#: Time taken to store a list of digests in the index
119CAS_INDEX_SAVE_DIGESTS_TIME_METRIC_NAME = "cas.index.save-digests-time"
121#: Time taken to get the total size of the CAS the index is for
122CAS_INDEX_SIZE_CALCULATION_TIME_METRIC_NAME = "cas.index.total-size-calculation-time"
124#
125# ActionCache metrics
126#
128#: Time that ``GetActionResult()`` operations took to complete
129AC_GET_ACTION_RESULT_TIME_METRIC_NAME = "get-action-result"
131#: Time that ``UpdateActionResult()`` operations took to complete
132AC_UPDATE_ACTION_RESULT_TIME_METRIC_NAME = "update-action-result"
134#: Number of cache hits from the ActionCache
135AC_CACHE_HITS_METRIC_NAME = "action-cache-hits"
137#: Number of cache misses from the ActionCache
138AC_CACHE_MISSES_METRIC_NAME = "action-cache-misses"
140#: Number of cache hits which became misses due to missing blobs in CAS
141AC_UNUSABLE_CACHE_HITS_METRIC_NAME = "action-cache-hits-with-missing-blobs"
143#
144# S3 metrics
145#
147#: Time taken to check errors from a bulk_delete
148S3_DELETE_ERROR_CHECK_METRIC_NAME = "s3-deletion-error-check-timer"
151#
152# Cleanup metrics
153#
155#: Number of blobs deleted per second in a cleanup batch
156CLEANUP_BLOBS_DELETION_RATE_METRIC_NAME = "cleanup.blobs-deleted-per-second"
158#: Number of bytes deleted per second in a cleanup batch
159CLEANUP_BYTES_DELETION_RATE_METRIC_NAME = "cleanup.bytes-deleted-per-second"
161#: Total time taken to clean enough blobs to get the CAS size down to the low watermark
162CLEANUP_RUNTIME_METRIC_NAME = "cleanup.runtime-timer"
164#: Time taken to bulk delete a set of blobs from the index
165CLEANUP_INDEX_BULK_DELETE_METRIC_NAME = "cleanup.index.bulk-delete-timer"
167#: Time taken to mark a set of blobs as deleted in the index
168CLEANUP_INDEX_MARK_DELETED_METRIC_NAME = "cleanup.index.mark-as-deleted-timer"
170#: Number of blobs that were already marked for deletion in the index when marking as deleted
171CLEANUP_INDEX_PREMARKED_BLOBS_METRIC_NAME = "cleanup.index.premarked-blobs-count"
173#: Time taken to bulk delete a set of blobs from the storage backend
174CLEANUP_STORAGE_BULK_DELETE_METRIC_NAME = "cleanup.storage.bulk-delete-timer"
176#: Number of blobs that failed to be deleted from the storage backend in a given bulk delete request
177CLEANUP_STORAGE_DELETION_FAILURES_METRIC_NAME = "cleanup.storage.deletion-failures-count"
180#
181# ExecutedActionMetadata metrics
182#
184#: Time spent queued before being assigned to a worker
185QUEUED_TIME_METRIC_NAME = "action-queued-time"
187#: Time spent in the worker (fetching inputs + executing + uploading outputs)
188WORKER_HANDLING_TIME_METRIC_NAME = "worker-handling-time"
190#: Time spent fetching inputs before execution
191INPUTS_FETCHING_TIME_METRIC_NAME = "inputs-fetching-time"
193#: Time spent waiting for executions to complete
194EXECUTION_TIME_METRIC_NAME = "execution-time"
196#: Time spent uploading outputs after execution
197OUTPUTS_UPLOADING_TIME_METRIC_NAME = "outputs-uploading-time"
199#: Total time spent servicing an execution request (time queued + fetching inputs +
200#: executing + uploading outputs)
201TOTAL_HANDLING_TIME_METRIC_NAME = "total-handling-time"
204#
205# Execution service metrics
206#
208#: Number of bots connected
209BOT_COUNT_METRIC_NAME = "bots-count"
211#: Number of clients connected
212CLIENT_COUNT_METRIC_NAME = "clients-count"
214#: Number of leases present in the scheduler
215LEASE_COUNT_METRIC_NAME = "lease-count"
217#: Counter metric indicating lease state transitions
218LEASE_CHANGES_COUNTER_METRIC_NAME = "lease-state-transitions-counter"
220#: Number of active jobs in the scheduler
221JOB_COUNT_METRIC_NAME = "job-count"
223#: Counter metric indicating job stage transitions
224JOB_CHANGES_COUNTER_METRIC_NAME = "job-stage-transitions-counter"
226#: Average time that a job spends waiting to be executed
227AVERAGE_QUEUE_TIME_METRIC_NAME = "average-queue-time"
229#: Number of ``Execute()`` requests received:
230EXECUTE_REQUEST_COUNT_METRIC_NAME = "execute-call-count"
232#: Time spent servicing ``Execute()`` requests:
233EXECUTE_SERVICER_TIME_METRIC_NAME = "execute-servicing-time"
235#: Number of ``WaitExecution()`` requests received:
236WAIT_EXECUTION_REQUEST_COUNT_METRIC_NAME = "wait-execution-call-count"
238#: Time spent servicing ``WaitExecution()`` requests:
239WAIT_EXECUTION_SERVICER_TIME_METRIC_NAME = "wait-execution-servicing-time"
241#
242# LogStream service metrics
243#
245#: Time spent creating a LogStream
246LOGSTREAM_CREATE_LOG_STREAM_TIME_METRIC_NAME = "logstream.create-logstream-time"
248#: Number of bytes in a committed logstream
249LOGSTREAM_WRITE_UPLOADED_BYTES_COUNT = "logstream.write.uploaded-bytes-count"
251#
252# Authentication Metrics
253#
255#: Number of invalid JWTs received:
256INVALID_JWT_COUNT_METRIC_NAME = "authentication.jwt.invalid-jwt-count"
258#: Duration of JWK fetch request:
259JWK_FETCH_TIME_METRIC_NAME = "authentication.jwk.fetch-request-time"
261#: Duration of JWT decoding:
262JWT_DECODE_TIME_METRIC_NAME = "authentication.jwt.decode-jwt-time"
264#: Duration of JWT validation (can include fetching JWK):
265JWT_VALIDATION_TIME_METRIC_NAME = "authentication.jwt.validate-jwt-time"
267#
268# Bots service metrics
269#
271#: Time spent servicing ``CreateBotSession()`` requests
272BOTS_CREATE_BOT_SESSION_TIME_METRIC_NAME = "bots.create-bot-session-time"
274#: Time spent servicing ``UpdateBotSession()`` requests
275BOTS_UPDATE_BOT_SESSION_TIME_METRIC_NAME = "bots.update-bot-session-time"
277#: Time spent selecting an Action from the data store to create a lease for
278BOTS_ASSIGN_JOB_LEASES_TIME_METRIC_NAME = "bots.assign-job-leases-time"
281#
282# Scheduler metrics
283#
285#: Time taken to queue an Action
286SCHEDULER_QUEUE_ACTION_TIME_METRIC_NAME = "scheduler.queue-action-time"
288#: Time taken to update a job's Lease
289SCHEDULER_UPDATE_LEASE_TIME_METRIC_NAME = "scheduler.update-lease-time"
291#: Time taken to cancel an Operation
292SCHEDULER_CANCEL_OPERATION_TIME_METRIC_NAME = "scheduler.cancel-operation-time"
295#
296# Data Store (scheduler's backend) metrics
297#
298# Some of these seem like duplicates of the request-level timers
299# at a glance, but measuring at the data store level allows us to
300# see how much overhead our own code is adding to the calls.
301#
303#: Time taken to create a Job
304DATA_STORE_CREATE_JOB_TIME_METRIC_NAME = "datastore.all.create-job-time"
306#: Time taken to enqueue a Job
307DATA_STORE_QUEUE_JOB_TIME_METRIC_NAME = "datastore.all.queue-job-time"
309#: Time taken to update a Job
310DATA_STORE_UPDATE_JOB_TIME_METRIC_NAME = "datastore.all.update-job-time"
312#: Time taken to create a Lease
313DATA_STORE_CREATE_LEASE_TIME_METRIC_NAME = "datastore.all.create-lease-time"
315#: Time taken to update a Lease
316DATA_STORE_UPDATE_LEASE_TIME_METRIC_NAME = "datastore.all.update-lease-time"
318#: Time taken to create an Operation
319DATA_STORE_CREATE_OPERATION_TIME_METRIC_NAME = "datastore.all.create-operation-time"
321#: Time taken to update an Operation
322DATA_STORE_UPDATE_OPERATION_TIME_METRIC_NAME = "datastore.all.update-operation-time"
324#: Time taken to get a list of Operations
325DATA_STORE_LIST_OPERATIONS_TIME_METRIC_NAME = "datastore.all.list-operations-time"
327#: Time taken to get a Job by Action Digest
328DATA_STORE_GET_JOB_BY_DIGEST_TIME_METRIC_NAME = "datastore.all.get-job-by-digest-time"
330#: Time taken to get a Job by name
331DATA_STORE_GET_JOB_BY_NAME_TIME_METRIC_NAME = "datastore.all.get-job-by-name-time"
333#: Time taken to get a Job by Operation name
334DATA_STORE_GET_JOB_BY_OPERATION_TIME_METRIC_NAME = "datastore.all.get-job-by-operation-time"
336#: Time taken to handle checking for a job update. When using
337#: a database backend other than PostgreSQL, this will measure
338#: how long it takes to check all watched jobs for updates once.
339#: For PostgreSQL and the in-memory scheduler, this measures how
340#: long it takes to handle a job update notification.
341DATA_STORE_CHECK_FOR_UPDATE_TIME_METRIC_NAME = "datastore.all.check-for-update-time"
343# SQL-specific metrics
345#: Time taken to store the ExecuteResponse
346DATA_STORE_STORE_RESPONSE_TIME_METRIC_NAME = "datastore.sql.store-response-time"
348#: Number of rows deleted from the jobs table during each pruning
349DATA_STORE_PRUNER_NUM_ROWS_DELETED_METRIC_NAME = "datastore.sql.pruner-num-rows-deleted"
351#: Time taken per scheduler pruning invocation
352DATA_STORE_PRUNER_DELETE_TIME_METRIC_NAME = "datastore.sql.pruner-delete-time"
355#
356# Operations service metrics
357#
359#: Time taken to completely handle a ListOperations request
360OPERATIONS_LIST_OPERATIONS_TIME_METRIC_NAME = "operations.list-operations-time"
362#: Time taken to completely handle a GetOperation request
363OPERATIONS_GET_OPERATION_TIME_METRIC_NAME = "operations.get-operation-time"
365#: Time taken to completely handle a CancelOperation request
366OPERATIONS_CANCEL_OPERATION_TIME_METRIC_NAME = "operations.cancel-operation-time"
368#: Time taken to completely handle a DeleteOperation request. BuildGrid
369#: doesn't actually support DeleteOperation, but this metric will at
370#: least provide insight into whether people are attempting to call it.
371OPERATIONS_DELETE_OPERATION_TIME_METRIC_NAME = "operations.delete-operation-time"