Skip to content

Commit 0dd67b9

Browse files
committed
Update CF_TIP_BOOT_FAILED_COUNT Metric
Add two labels to the CF_TIP_BOOT_FAILED_COUNT metric: `is_candidate`, which distinguishes production instances from candidate instances, and `instance_id`, which records the GCE VM instance ID so that each boot result can be attributed to a specific instance.
1 parent 74f530a commit 0dd67b9

File tree

3 files changed

+72
-7
lines changed

3 files changed

+72
-7
lines changed

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@
4040
'Count of failure in booting up cuttlefish with tip-of-the-tree build ',
4141
field_spec=[
4242
monitor.StringField('build_id'),
43+
# Add 'instance_id' field to get the GCE VM ID
44+
monitor.StringField('instance_id'),
45+
# Add 'is_candidate' field to distinguish between prod and
46+
# candidate instances.
47+
monitor.BooleanField('is_candidate'),
4348
monitor.BooleanField('is_succeeded'),
4449
])
4550

@@ -439,4 +444,4 @@
439444
monitor.BooleanField('deploy_kubernetes'),
440445
monitor.BooleanField('deploy_terraform'),
441446
monitor.StringField('clusterfuzz_version')
442-
])
447+
])

src/clusterfuzz/_internal/platforms/android/flash.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
from clusterfuzz._internal.base import dates
2121
from clusterfuzz._internal.base import persistent_cache
22+
from clusterfuzz._internal.base import utils
2223
from clusterfuzz._internal.datastore import locks
24+
from clusterfuzz._internal.google_cloud_utils import compute_metadata
2325
from clusterfuzz._internal.metrics import logs
2426
from clusterfuzz._internal.metrics import monitoring_metrics
2527
from clusterfuzz._internal.system import archive
@@ -171,10 +173,17 @@ def flash_to_latest_build_if_needed():
171173
'branch %s and target %s.' % (branch, target))
172174
return
173175

176+
instance_id = None
177+
is_candidate = None
174178
if environment.is_android_cuttlefish():
175179
download_latest_build(build_info, FLASH_CUTTLEFISH_REGEXES, image_directory)
176180
adb.recreate_cuttlefish_device()
177181
adb.connect_to_cuttlefish_device()
182+
if compute_metadata.is_gce():
183+
# Get the GCE-assigned VM instance ID for accurate instance tracking.
184+
instance_id = compute_metadata.get('instance/id')
185+
# Determine if the current instance is a candidate.
186+
is_candidate = utils.get_clusterfuzz_release() == 'candidate'
178187
else:
179188
download_latest_build(build_info, FLASH_IMAGE_REGEXES, image_directory)
180189
# We do one device flash at a time on one host, otherwise we run into
@@ -220,6 +229,8 @@ def flash_to_latest_build_if_needed():
220229
logs.info('Trying to boot cuttlefish instance using stable build.')
221230
monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({
222231
'build_id': build_info['bid'],
232+
'instance_id': instance_id,
233+
'is_candidate': is_candidate,
223234
'is_succeeded': False
224235
})
225236
boot_stable_build_cuttlefish(branch, target, image_directory)
@@ -229,9 +240,10 @@ def flash_to_latest_build_if_needed():
229240
else:
230241
logs.error('Unable to find device. Reimaging failed.')
231242
adb.bad_state_reached()
232-
233243
monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({
234244
'build_id': build_info['bid'],
245+
'instance_id': instance_id,
246+
'is_candidate': is_candidate,
235247
'is_succeeded': True
236248
})
237249
logs.info('Reimaging finished.')

src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def setUp(self):
7676
'clusterfuzz._internal.base.persistent_cache.get_value',
7777
'clusterfuzz._internal.base.persistent_cache.set_value',
7878
'clusterfuzz._internal.base.persistent_cache.delete_value',
79+
'clusterfuzz._internal.google_cloud_utils.compute_metadata.get',
7980
'clusterfuzz._internal.platforms.android.settings.is_google_device',
8081
'clusterfuzz._internal.platforms.android.fetch_artifact.get_latest_artifact_info',
8182
'clusterfuzz._internal.system.environment.is_android_cuttlefish',
@@ -116,37 +117,84 @@ def _setup_monitoring_daemon(self, mock_client):
116117
monitor._monitoring_daemon.start()
117118
return call_queue
118119

119-
def _assert_cuttlefish_boot_metric(self, time_series, is_succeeded):
120+
def _assert_cuttlefish_boot_metric(self, time_series, instance_id,
121+
is_candidate, is_succeeded):
120122
"""Asserts Cuttlefish boot failure metric presence and correctness in time series."""
121123
for ts in time_series:
122124
if ts.metric.type == "custom.googleapis.com/tip_boot_failure":
125+
if instance_id is not None and ts.metric.labels['instance_id'] != str(
126+
instance_id):
127+
continue
128+
if is_candidate is not None and ts.metric.labels['is_candidate'] != str(
129+
is_candidate):
130+
continue
123131
if is_succeeded is not None and ts.metric.labels['is_succeeded'] != str(
124132
is_succeeded):
125133
continue
134+
self.assertEqual(ts.metric.labels['instance_id'], str(instance_id))
135+
self.assertEqual(ts.metric.labels['is_candidate'], str(is_candidate))
126136
self.assertEqual(ts.metric.labels['is_succeeded'], str(is_succeeded))
127137
self.assertEqual(ts.metric.labels['build_id'], "test-bid")
128138

139+
def _fake_get(self, path):
140+
if path == "instance/zone":
141+
return "projects/1234567890/zones/us-central1-b"
142+
if path == "instance/id":
143+
return "1234567890"
144+
return ""
145+
146+
@patch(
147+
'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
148+
def test_cuttlefish_boot_success_metric_for_candidate_fleet(
149+
self, mock_client):
150+
"""Tests the metric emission for a successful Cuttlefish boot."""
151+
self.mock.get.side_effect = self._fake_get
152+
call_queue = self._setup_monitoring_daemon(mock_client)
153+
self.mock.get_device_state.return_value = 'device'
154+
flash.flash_to_latest_build_if_needed()
155+
args = call_queue.get(timeout=20)
156+
time_series = args['time_series']
157+
self._assert_cuttlefish_boot_metric(time_series, '1234567890', True, True)
158+
monitor._monitoring_daemon.stop()
159+
160+
@patch(
161+
'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
162+
def test_cuttlefish_boot_failure_metric_for_candidate_fleet(
163+
self, mock_client):
164+
"""Tests the metric emission for a failed Cuttlefish boot."""
165+
self.mock.get.side_effect = self._fake_get
166+
call_queue = self._setup_monitoring_daemon(mock_client)
167+
flash.flash_to_latest_build_if_needed()
168+
args = call_queue.get(timeout=20)
169+
time_series = args['time_series']
170+
self._assert_cuttlefish_boot_metric(time_series, '1234567890', True, False)
171+
monitor._monitoring_daemon.stop()
172+
129173
@patch(
130174
'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
131-
def test_cuttlefish_boot_success_metric(self, mock_client):
175+
def test_cuttlefish_boot_success_metric_for_production_fleet(
176+
self, mock_client):
132177
"""Tests the metric emission for a successful Cuttlefish boot."""
178+
self.mock.get.side_effect = self._fake_get
133179
call_queue = self._setup_monitoring_daemon(mock_client)
134180
self.mock.get_device_state.return_value = 'device'
135181
flash.flash_to_latest_build_if_needed()
136182
args = call_queue.get(timeout=20)
137183
time_series = args['time_series']
138-
self._assert_cuttlefish_boot_metric(time_series, True)
184+
self._assert_cuttlefish_boot_metric(time_series, '1234567890', False, True)
139185
monitor._monitoring_daemon.stop()
140186

141187
@patch(
142188
'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
143-
def test_cuttlefish_boot_failure_metric(self, mock_client):
189+
def test_cuttlefish_boot_failure_metric_for_production_fleet(
190+
self, mock_client):
144191
"""Tests the metric emission for a failed Cuttlefish boot."""
192+
self.mock.get.side_effect = self._fake_get
145193
call_queue = self._setup_monitoring_daemon(mock_client)
146194
flash.flash_to_latest_build_if_needed()
147195
args = call_queue.get(timeout=20)
148196
time_series = args['time_series']
149-
self._assert_cuttlefish_boot_metric(time_series, False)
197+
self._assert_cuttlefish_boot_metric(time_series, '1234567890', False, False)
150198
monitor._monitoring_daemon.stop()
151199

152200
def test_counter_metric_success(self):

0 commit comments

Comments
 (0)