diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 10c2f67595..4d84d167ec 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -40,6 +40,8 @@ 'Count of failure in booting up cuttlefish with tip-of-the-tree build ', field_spec=[ monitor.StringField('build_id'), + monitor.StringField('instance_id'), + monitor.BooleanField('is_candidate'), monitor.BooleanField('is_succeeded'), ]) diff --git a/src/clusterfuzz/_internal/platforms/android/flash.py b/src/clusterfuzz/_internal/platforms/android/flash.py index 14c29635e0..98b9a85819 100644 --- a/src/clusterfuzz/_internal/platforms/android/flash.py +++ b/src/clusterfuzz/_internal/platforms/android/flash.py @@ -19,7 +19,9 @@ from clusterfuzz._internal.base import dates from clusterfuzz._internal.base import persistent_cache +from clusterfuzz._internal.base import utils from clusterfuzz._internal.datastore import locks +from clusterfuzz._internal.google_cloud_utils import compute_metadata from clusterfuzz._internal.metrics import logs from clusterfuzz._internal.metrics import monitoring_metrics from clusterfuzz._internal.system import archive @@ -171,10 +173,17 @@ def flash_to_latest_build_if_needed(): 'branch %s and target %s.' % (branch, target)) return + instance_id = None + is_candidate = None if environment.is_android_cuttlefish(): download_latest_build(build_info, FLASH_CUTTLEFISH_REGEXES, image_directory) adb.recreate_cuttlefish_device() adb.connect_to_cuttlefish_device() + if compute_metadata.is_gce(): + # Get the GCE-assigned VM instance ID for accurate instance tracking. + instance_id = compute_metadata.get('instance/id') + # Determine if the current instance is a candidate. + is_candidate = utils.get_clusterfuzz_release() == 'candidate' else: download_latest_build(build_info, FLASH_IMAGE_REGEXES, image_directory) # We do one device flash at a time on one host, otherwise we run into @@ -220,6 +229,8 @@ def flash_to_latest_build_if_needed(): logs.info('Trying to boot cuttlefish instance using stable build.') monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({ 'build_id': build_info['bid'], + 'instance_id': instance_id, + 'is_candidate': is_candidate, 'is_succeeded': False }) boot_stable_build_cuttlefish(branch, target, image_directory) @@ -229,9 +240,10 @@ def flash_to_latest_build_if_needed(): else: logs.error('Unable to find device. Reimaging failed.') adb.bad_state_reached() - monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({ 'build_id': build_info['bid'], + 'instance_id': instance_id, + 'is_candidate': is_candidate, 'is_succeeded': True }) logs.info('Reimaging finished.') diff --git a/src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py b/src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py index f2d8f374bf..80c9ba367d 100644 --- a/src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py +++ b/src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py @@ -76,6 +76,7 @@ def setUp(self): 'clusterfuzz._internal.base.persistent_cache.get_value', 'clusterfuzz._internal.base.persistent_cache.set_value', 'clusterfuzz._internal.base.persistent_cache.delete_value', + 'clusterfuzz._internal.google_cloud_utils.compute_metadata.get', 'clusterfuzz._internal.platforms.android.settings.is_google_device', 'clusterfuzz._internal.platforms.android.fetch_artifact.get_latest_artifact_info', 'clusterfuzz._internal.system.environment.is_android_cuttlefish', @@ -116,37 +117,84 @@ def _setup_monitoring_daemon(self, mock_client): monitor._monitoring_daemon.start() return call_queue - def _assert_cuttlefish_boot_metric(self, time_series, is_succeeded): + def _assert_cuttlefish_boot_metric(self, time_series, instance_id, + is_candidate, is_succeeded): """Asserts Cuttlefish boot failure metric presence and correctness in time series.""" for ts in time_series: if ts.metric.type == "custom.googleapis.com/tip_boot_failure": + if instance_id is not None and ts.metric.labels['instance_id'] != str( + instance_id): + continue + if is_candidate is not None and ts.metric.labels['is_candidate'] != str( + is_candidate): + continue if is_succeeded is not None and ts.metric.labels['is_succeeded'] != str( is_succeeded): continue + self.assertEqual(ts.metric.labels['instance_id'], str(instance_id)) + self.assertEqual(ts.metric.labels['is_candidate'], str(is_candidate)) self.assertEqual(ts.metric.labels['is_succeeded'], str(is_succeeded)) self.assertEqual(ts.metric.labels['build_id'], "test-bid") + def _fake_get(self, path): + if path == "instance/zone": + return "projects/1234567890/zones/us-central1-b" + if path == "instance/id": + return "1234567890" + return "" + + @patch( + 'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient') + def test_cuttlefish_boot_success_metric_for_candidate_fleet( + self, mock_client): + """Tests the metric emission for a successful Cuttlefish boot.""" + self.mock.get.side_effect = self._fake_get + call_queue = self._setup_monitoring_daemon(mock_client) + self.mock.get_device_state.return_value = 'device' + flash.flash_to_latest_build_if_needed() + args = call_queue.get(timeout=20) + time_series = args['time_series'] + self._assert_cuttlefish_boot_metric(time_series, '1234567890', True, True) + monitor._monitoring_daemon.stop() + + @patch( + 'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient') + def test_cuttlefish_boot_failure_metric_for_candidate_fleet( + self, mock_client): + """Tests the metric emission for a failed Cuttlefish boot.""" + self.mock.get.side_effect = self._fake_get + call_queue = self._setup_monitoring_daemon(mock_client) + flash.flash_to_latest_build_if_needed() + args = call_queue.get(timeout=20) + time_series = args['time_series'] + self._assert_cuttlefish_boot_metric(time_series, '1234567890', True, False) + monitor._monitoring_daemon.stop() + @patch( 'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient') - def test_cuttlefish_boot_success_metric(self, mock_client): + def test_cuttlefish_boot_success_metric_for_production_fleet( + self, mock_client): """Tests the metric emission for a successful Cuttlefish boot.""" + self.mock.get.side_effect = self._fake_get call_queue = self._setup_monitoring_daemon(mock_client) self.mock.get_device_state.return_value = 'device' flash.flash_to_latest_build_if_needed() args = call_queue.get(timeout=20) time_series = args['time_series'] - self._assert_cuttlefish_boot_metric(time_series, True) + self._assert_cuttlefish_boot_metric(time_series, '1234567890', False, True) monitor._monitoring_daemon.stop() @patch( 'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient') - def test_cuttlefish_boot_failure_metric(self, mock_client): + def test_cuttlefish_boot_failure_metric_for_production_fleet( + self, mock_client): """Tests the metric emission for a failed Cuttlefish boot.""" + self.mock.get.side_effect = self._fake_get call_queue = self._setup_monitoring_daemon(mock_client) flash.flash_to_latest_build_if_needed() args = call_queue.get(timeout=20) time_series = args['time_series'] - self._assert_cuttlefish_boot_metric(time_series, False) + self._assert_cuttlefish_boot_metric(time_series, '1234567890', False, False) monitor._monitoring_daemon.stop() def test_counter_metric_success(self):