Skip to content

Commit

Permalink
[Monitoring] Collecting a metric for the age of untriaged testcases (#…
Browse files Browse the repository at this point in the history
…4381)

### Motivation


Once a testcase is generated (or manually uploaded), followup tasks
(analyze/progression) are started. This happens by publishing to a
pubsub queue, both for the manually uploaded case, and for the fuzzer
generated case.

If for any reason the messages are not processed, the testcase gets
stuck. To get better visibility into these stuck testcases, the
UNTRIAGED_TESTCASE_AGE metric is introduced to pinpoint how old these
untriaged testcases are (more precisely, testcases that have not yet gone
through the analyze/regression/impact/progression tasks).


### Attention points

Testcase.timestamp mutates in analyze task:


https://github.com/google/clusterfuzz/blob/6ed80851ad0f6f624c5b232b0460c405f0a018b5/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py#L589

This makes it unreliable as a source of truth for testcase creation
time. To circumvent that, a new ```created``` field is added to the
Testcase entity, from which we can derive the correct creation time.

Since this new field will only apply for testcases created after this PR
is merged, Testcase.timestamp will be used instead to calculate the
testcase age when the new field is missing.

### Testing strategy

Ran the triage cron locally, and verified the codepath for the metric is
hit and produces sane output (reference testcase: 4505741036158976).

![image](https://github.com/user-attachments/assets/6281b44f-768a-417e-8ec1-763f132c8181)


Part of #4271
  • Loading branch information
vitorguidi authored Nov 13, 2024
1 parent 082b008 commit ba9009a
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 2 deletions.
23 changes: 22 additions & 1 deletion src/clusterfuzz/_internal/cron/triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,22 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


def _emit_untriaged_testcase_age_metric(critical_tasks_completed: bool,
                                        testcase: data_types.Testcase):
  """Emits a metric to track the age of untriaged testcases.

  Args:
    critical_tasks_completed: Whether all critical tasks (analyze,
      regression, minimization, impact) have finished for this testcase.
    testcase: The testcase currently being considered by the triage cron.
  """
  # Fully triaged testcases are not interesting for this metric.
  if critical_tasks_completed:
    return
  # Guard on the actual creation-time source of truth: `created` when
  # present, with `timestamp` as the fallback for older testcases. Without
  # either, the age cannot be computed.
  if not testcase.get_created_time():
    return

  monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
      testcase.get_age_in_seconds(),
      labels={
          'job': testcase.job_type,
          'platform': testcase.platform,
      })


def main():
"""Files bugs."""
try:
Expand Down Expand Up @@ -328,6 +344,8 @@ def main():
# Already deleted.
continue

critical_tasks_completed = data_handler.critical_tasks_completed(testcase)

# Skip if testcase's job is removed.
if testcase.job_type not in all_jobs:
continue
Expand All @@ -336,6 +354,9 @@ def main():
if testcase.job_type in excluded_jobs:
continue

# Emmit the metric for testcases that should be triaged.
_emit_untriaged_testcase_age_metric(critical_tasks_completed, testcase)

# Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'):
continue
Expand All @@ -351,7 +372,7 @@ def main():

# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not data_handler.critical_tasks_completed(testcase):
if not critical_tasks_completed:
continue

# For testcases that are not part of a group, wait an additional time to
Expand Down
2 changes: 2 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ def store_testcase(crash, fuzzed_keys, minimized_keys, regression, fixed,
testcase.archive_filename = archive_filename
testcase.http_flag = http_flag
testcase.timestamp = datetime.datetime.utcnow()
testcase.created = testcase.timestamp
testcase.gestures = gestures
testcase.redzone = redzone
testcase.disable_ubsan = disable_ubsan
Expand Down Expand Up @@ -1377,6 +1378,7 @@ def create_user_uploaded_testcase(key,
testcase.set_metadata(metadata_key, metadata_value, update_testcase=False)

testcase.timestamp = utils.utcnow()
testcase.created = testcase.timestamp
testcase.uploader_email = uploader_email
testcase.put()

Expand Down
20 changes: 19 additions & 1 deletion src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,9 +421,16 @@ class Testcase(Model):
# File name of the original uploaded archive.
archive_filename = ndb.TextProperty()

# Timestamp.
# The time when a testcase is considered valid. This is the same as the
# creation time, except for analyze task, in which this field is a
# placeholder and will be refreshed.
timestamp = ndb.DateTimeProperty()

# Source of truth for creation time. This is missing for testcases
# created before it was introduced, in which case the timestamp
# field will be a proxy for creation time.
created = ndb.DateTimeProperty(indexed=False)

# Does the testcase crash stack vary b/w crashes ?
flaky_stack = ndb.BooleanProperty(default=False, indexed=False)

Expand Down Expand Up @@ -671,6 +678,17 @@ def _ensure_metadata_is_cached(self):

setattr(self, 'metadata_cache', cache)

def get_created_time(self) -> datetime.datetime:
  """Returns the testcase creation time.

  `created` is the source of truth for creation time. It is missing for
  testcases created before the field was introduced, in which case
  `timestamp` is used as a proxy. May return None if neither field is set.
  """
  return self.created if self.created else self.timestamp

def get_age_in_seconds(self):
  """Returns the seconds elapsed between now (UTC) and the creation time."""
  elapsed = datetime.datetime.utcnow() - self.get_created_time()
  return elapsed.total_seconds()

def get_metadata(self, key=None, default=None):
"""Get metadata for a test case. Slow on first access."""
self._ensure_metadata_is_cached()
Expand Down
11 changes: 11 additions & 0 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,17 @@
monitor.StringField('status'),
])

# Distribution of ages (in seconds) of testcases that have not yet completed
# all critical triage tasks. Emitted by the triage cron to surface testcases
# stuck in the triage pipeline (e.g. unprocessed pubsub messages).
UNTRIAGED_TESTCASE_AGE = monitor.CumulativeDistributionMetric(
    'issues/untriaged_testcase_age',
    description='Age of testcases that were not yet triaged '
    '(have not yet completed analyze, regression,'
    ' minimization, impact task), in seconds.',
    bucketer=monitor.GeometricBucketer(),
    field_spec=[
        monitor.StringField('job'),
        monitor.StringField('platform'),
    ])

ANALYZE_TASK_REPRODUCIBILITY = monitor.CounterMetric(
'task/analyze/reproducibility',
description='Outcome count for analyze task.',
Expand Down

0 comments on commit ba9009a

Please sign in to comment.