Fix flakereport

stephanos · stephanos · commit 7f5d574ed048 · 2026-02-25T11:06:17.000-08:00
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -354,12 +354,19 @@ jobs:
           flags: unit-test
           report_type: test_results
 
+      - name: Get job ID
+        id: get_job_id
+        # Steps are skipped by default once an earlier step has failed. The
+        # artifact upload step of this job runs under `!cancelled()` and builds
+        # the artifact name from this step's `job_id` output, so this step must
+        # run under the same condition or failed jobs get an empty job ID.
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/get-job-id
+        with:
+          job_name: Unit test
+          run_id: ${{ github.run_id }}
+
       - name: Upload test results to GitHub
         # Can't pin to major because the action linter doesn't recognize the include-hidden-files flag.
         uses: actions/upload-artifact@v4.4.3
         if: ${{ !cancelled() }}
         with:
-          name: junit-xml--${{github.run_id}}--${{github.run_attempt}}--unit-test
+          name: junit-xml--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--unit-test
           path: ./.testoutput/junit.*.xml
           include-hidden-files: true
           retention-days: 28
@@ -447,12 +454,19 @@ jobs:
           flags: integration-test
           report_type: test_results
 
+      - name: Get job ID
+        id: get_job_id
+        # Steps are skipped by default once an earlier step has failed. The
+        # artifact upload step of this job runs under `!cancelled()` and builds
+        # the artifact name from this step's `job_id` output, so this step must
+        # run under the same condition or failed jobs get an empty job ID.
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/get-job-id
+        with:
+          job_name: Integration test
+          run_id: ${{ github.run_id }}
+
       - name: Upload test results to GitHub
         # Can't pin to major because the action linter doesn't recognize the include-hidden-files flag.
         uses: actions/upload-artifact@v4.4.3
         if: ${{ !cancelled() }}
         with:
-          name: junit-xml--${{github.run_id}}--${{github.run_attempt}}--integration-test
+          name: junit-xml--${{ github.run_id }}--${{ steps.get_job_id.outputs.job_id }}--${{ github.run_attempt }}--integration-test
           path: ./.testoutput/junit.*.xml
           include-hidden-files: true
           retention-days: 28
diff --git a/tools/flakereport/flakereport.go b/tools/flakereport/flakereport.go
@@ -259,8 +259,9 @@ func runGenerateCommand(c *cli.Context) (err error) {
 	// Count test runs by name for failure rate calculation
 	testRunCounts := countTestRuns(allTestRuns)
 
-	// Group failures by test name
+	// Group failures by test name, then remove parent entries whose subtests were observed.
 	grouped := groupFailuresByTest(allFailures)
+	filterParentTests(grouped, testRunCounts)
 	fmt.Printf("Unique tests with failures: %d\n", len(grouped))
 
 	// Classify failures
diff --git a/tools/flakereport/github.go b/tools/flakereport/github.go
@@ -191,8 +191,8 @@ func extractArtifactZip(zipPath, outputDir string) ([]string, error) {
 	return xmlFiles, nil
 }
 
-// parseArtifactName extracts run_id and job_id from artifact name
-// Format: {prefix}--{run_id}--{job_id}--{suffix}
+// parseArtifactName extracts run_id and job_id from artifact name.
+// Format: junit-xml--{run_id}--{job_id}--{run_attempt}--...--{test-type}
 // Returns: runID, jobID (or "unknown" if not parseable)
 func parseArtifactName(artifactName string) (runID string, jobID string) {
 	parts := strings.Split(artifactName, "--")
diff --git a/tools/flakereport/parallel.go b/tools/flakereport/parallel.go
@@ -135,7 +135,8 @@ func processArtifactJob(ctx context.Context, job ArtifactJob, totalArtifacts int
 		result.Failures = append(result.Failures, failures...)
 
 		// Extract all test runs for failure rate calculation
-		testRuns := extractAllTestRuns(suites, job.RunID)
+		_, jobID := parseArtifactName(job.Artifact.Name)
+		testRuns := extractAllTestRuns(suites, job.RunID, jobID)
 		result.AllRuns = append(result.AllRuns, testRuns...)
 	}
 
diff --git a/tools/flakereport/parser.go b/tools/flakereport/parser.go
@@ -94,7 +94,7 @@ func extractFailures(suites *junit.Testsuites, artifactName string, runID int64,
 
 // extractAllTestRuns extracts all test runs (including successes) from parsed JUnit data
 // Used for calculating failure rates
-func extractAllTestRuns(suites *junit.Testsuites, runID int64) []TestRun {
+func extractAllTestRuns(suites *junit.Testsuites, runID int64, jobID string) []TestRun {
 	var runs []TestRun
 
 	for _, suite := range suites.Suites {
@@ -105,6 +105,7 @@ func extractAllTestRuns(suites *junit.Testsuites, runID int64) []TestRun {
 				Failed:    testcase.Failure != nil,
 				Skipped:   testcase.Skipped != nil,
 				RunID:     runID,
+				JobID:     jobID,
 			}
 			runs = append(runs, run)
 		}
@@ -237,6 +238,25 @@ func convertToReports(grouped map[string][]TestFailure, testRunCounts map[string
 	return reports
 }
 
+// filterParentTests deletes from grouped every top-level test name (one with
+// no "/") that is the parent of at least one "<parent>/<subtest>" key present
+// in testRunCounts. Per the author's rationale, such a parent failure is
+// already reported in the Flaky Suites section with a correct denominator, so
+// keeping it in the per-test table would show a misleading 1/1 entry.
+// grouped is modified in place; testRunCounts is only read.
+func filterParentTests(grouped map[string][]TestFailure, testRunCounts map[string]int) {
+	// Collect the text before the first "/" of every subtest name that ran.
+	parents := make(map[string]struct{}, len(testRunCounts))
+	for name := range testRunCounts {
+		if parent, _, isSubtest := strings.Cut(name, "/"); isSubtest {
+			parents[parent] = struct{}{}
+		}
+	}
+	// Deleting entries while ranging over the same map is safe in Go.
+	for testName := range grouped {
+		if strings.Contains(testName, "/") {
+			continue
+		}
+		if _, hasSubtests := parents[testName]; hasSubtests {
+			delete(grouped, testName)
+		}
+	}
+}
+
 // isFinalRetry returns true if the test name has the "(final)" suffix,
 // indicating the test runner exhausted all retries.
 func isFinalRetry(testName string) bool {
@@ -341,23 +361,36 @@ func identifyCIBreakers(failures []TestFailure) (map[string][]TestFailure, map[s
 	return ciBreakers, ciBreakCount
 }
 
+// jobKey returns the set key that identifies a single job execution.
+//
+// The key always embeds runID. A usable jobID (neither empty nor "unknown")
+// yields "<runID>/<jobID>"; otherwise the key is the runID alone.
+//
+// jobID is deliberately not used as the key by itself. Per the documented
+// artifact-name format it is the third "--"-separated field of the name, and
+// artifacts uploaded under the older junit-xml--{run_id}--{run_attempt}--...
+// naming carry the run attempt in that position. Keying on that value alone
+// would merge all such executions into a single entry. Prefixing with runID
+// also keeps a bare-runID fallback key from ever equalling a job-based key.
+func jobKey(runID int64, jobID string) string {
+	if jobID == "" || jobID == "unknown" {
+		return fmt.Sprintf("%d", runID)
+	}
+	return fmt.Sprintf("%d/%s", runID, jobID)
+}
+
 // generateSuiteReports creates per-suite flake breakdown from all failures and test runs.
-// Suite flake rate = % of workflow runs where the suite had at least one non-retry failure.
+// Suite flake rate = % of job executions where the suite had at least one non-retry failure.
 func generateSuiteReports(allFailures []TestFailure, allTestRuns []TestRun) []SuiteReport {
-	// Track unique workflow runs per suite (denominator)
-	suiteRuns := make(map[string]map[int64]bool)
+	// Track unique job executions per suite (denominator).
+	// Each matrix shard / DB-config combination is a separate job execution even
+	// though it shares the same workflow RunID, so we key by JobID (falling back
+	// to RunID when JobID is unavailable).
+	suiteRuns := make(map[string]map[string]bool)
 	for _, run := range allTestRuns {
 		if run.Skipped || !isGoTestSuite(run.SuiteName) {
 			continue
 		}
 		if suiteRuns[run.SuiteName] == nil {
-			suiteRuns[run.SuiteName] = make(map[int64]bool)
+			suiteRuns[run.SuiteName] = make(map[string]bool)
 		}
-		suiteRuns[run.SuiteName][run.RunID] = true
+		suiteRuns[run.SuiteName][jobKey(run.RunID, run.JobID)] = true
 	}
 
-	// Track workflow runs with non-retry failures per suite (numerator)
-	suiteFailedRuns := make(map[string]map[int64]bool)
+	// Track job executions with non-retry failures per suite (numerator)
+	suiteFailedRuns := make(map[string]map[string]bool)
 	suiteLastFailure := make(map[string]time.Time)
 	for _, failure := range allFailures {
 		if !isGoTestSuite(failure.SuiteName) {
@@ -368,9 +401,9 @@ func generateSuiteReports(allFailures []TestFailure, allTestRuns []TestRun) []Su
 			continue
 		}
 		if suiteFailedRuns[failure.SuiteName] == nil {
-			suiteFailedRuns[failure.SuiteName] = make(map[int64]bool)
+			suiteFailedRuns[failure.SuiteName] = make(map[string]bool)
 		}
-		suiteFailedRuns[failure.SuiteName][failure.RunID] = true
+		suiteFailedRuns[failure.SuiteName][jobKey(failure.RunID, failure.JobID)] = true
 		if failure.Timestamp.After(suiteLastFailure[failure.SuiteName]) {
 			suiteLastFailure[failure.SuiteName] = failure.Timestamp
 		}
diff --git a/tools/flakereport/parser_test.go b/tools/flakereport/parser_test.go
@@ -52,15 +52,27 @@ func TestParseArtifactName(t *testing.T) {
 		expectedJobID string
 	}{
 		{
-			name:          "valid artifact name",
-			artifactName:  "test-results--12345678--87654321--junit",
-			expectedRunID: "12345678",
-			expectedJobID: "87654321",
+			name:          "functional test artifact",
+			artifactName:  "junit-xml--22373551837--64609560060--1--integration-0--Integration--functional-test",
+			expectedRunID: "22373551837",
+			expectedJobID: "64609560060",
+		},
+		{
+			name:          "unit test artifact",
+			artifactName:  "junit-xml--22373551837--64609560061--1--unit-test",
+			expectedRunID: "22373551837",
+			expectedJobID: "64609560061",
+		},
+		{
+			name:          "integration test artifact",
+			artifactName:  "junit-xml--22373551837--64609560062--1--integration-test",
+			expectedRunID: "22373551837",
+			expectedJobID: "64609560062",
 		},
 		{
 			name:          "artifact name with empty job id",
-			artifactName:  "test-results--12345678----junit",
-			expectedRunID: "12345678",
+			artifactName:  "junit-xml--22373551837----1--unit-test",
+			expectedRunID: "22373551837",
 			expectedJobID: "unknown",
 		},
 		{
@@ -157,6 +169,41 @@ func TestClassifyFailures(t *testing.T) {
 	assert.Len(t, flaky["TestNormal"], 2)
 }
 
+// TestFilterParentTests checks which entries filterParentTests removes from a
+// grouped-failures map for a given set of observed test-run counts.
+func TestFilterParentTests(t *testing.T) {
+	cases := []struct {
+		desc    string
+		failing []string       // names seeded into grouped, one failure each
+		counts  map[string]int // observed test-run counts
+		dropped []string       // names that must be gone afterwards
+		kept    []string       // names that must still be present
+	}{
+		{
+			desc:    "removes parent when subtests observed",
+			failing: []string{"TestFooSuite", "TestFooSuite/TestBar"},
+			counts: map[string]int{
+				"TestFooSuite/TestBar": 10,
+				"TestFooSuite/TestBaz": 20,
+			},
+			dropped: []string{"TestFooSuite"},
+			kept:    []string{"TestFooSuite/TestBar"},
+		},
+		{
+			desc:    "keeps parent when no subtests observed",
+			failing: []string{"TestStandalone"},
+			counts:  map[string]int{"TestStandalone": 5},
+			kept:    []string{"TestStandalone"},
+		},
+		{
+			desc:    "keeps subtest entry regardless",
+			failing: []string{"TestFooSuite/TestBar"},
+			counts:  map[string]int{"TestFooSuite/TestBar": 10},
+			kept:    []string{"TestFooSuite/TestBar"},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			grouped := make(map[string][]TestFailure, len(tc.failing))
+			for _, name := range tc.failing {
+				grouped[name] = []TestFailure{{Name: name}}
+			}
+
+			filterParentTests(grouped, tc.counts)
+
+			for _, name := range tc.dropped {
+				require.NotContains(t, grouped, name)
+			}
+			for _, name := range tc.kept {
+				require.Contains(t, grouped, name)
+			}
+		})
+	}
+}
+
 func TestGenerateSuiteReports(t *testing.T) {
 	now := time.Now()
 	twoDaysAgo := now.Add(-48 * time.Hour)
diff --git a/tools/flakereport/types.go b/tools/flakereport/types.go
@@ -20,6 +20,7 @@ type TestRun struct {
 	Failed    bool   // Whether the test failed
 	Skipped   bool   // Whether the test was skipped
 	RunID     int64  // Workflow run ID
+	JobID     string // GitHub Actions job ID (unique per matrix job/shard)
 }
 
 // TestReport represents aggregated failures for a single test
@@ -35,9 +36,9 @@ type TestReport struct {
 // SuiteReport represents aggregated flake data for a test suite
 type SuiteReport struct {
 	SuiteName   string    // Test suite name from JUnit XML
-	FlakeRate   float64   // Percentage of runs with at least one non-retry failure
-	FailedRuns  int       // Number of runs with at least one non-retry failure
-	TotalRuns   int       // Total number of workflow runs where this suite appeared
+	FlakeRate   float64   // Percentage of job executions with at least one non-retry failure
+	FailedRuns  int       // Number of job executions with at least one non-retry failure
+	TotalRuns   int       // Total number of job executions where this suite appeared
 	LastFailure time.Time // Timestamp of the most recent failure
 }
 

Original file line number	Diff line number	Diff line change
`@@ -135,7 +135,8 @@ func processArtifactJob(ctx context.Context, job ArtifactJob, totalArtifacts int`
`135`	`135`	`result.Failures = append(result.Failures, failures...)`
`136`	`136`
`137`	`137`	`// Extract all test runs for failure rate calculation`
`138`		`- testRuns := extractAllTestRuns(suites, job.RunID)`
	`138`	`+ _, jobID := parseArtifactName(job.Artifact.Name)`
	`139`	`+ testRuns := extractAllTestRuns(suites, job.RunID, jobID)`
`139`	`140`	`result.AllRuns = append(result.AllRuns, testRuns...)`
`140`	`141`	`}`
`141`	`142`