Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions config/views.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,97 @@ component_readiness:
enabled: true
automate_jira:
enabled: true
- name: 4.22-main-mass-failure
base_release:
release: "4.21"
relative_start: ga-30d
relative_end: ga
sample_release:
release: "4.22"
relative_start: now-7d
relative_end: now
variant_options:
column_group_by:
Network: { }
Platform: { }
Topology: { }
db_group_by:
Architecture: { }
FeatureSet: { }
Installer: { }
Network: { }
Platform: { }
Suite: { }
Topology: { }
Upgrade: { }
LayeredProduct: { }
include_variants:
Architecture:
- amd64
- arm64
- multi
FeatureSet:
- default
- techpreview
Installer:
- ipi
- upi
- hypershift
JobTier:
- blocking
- informing
- standard
LayeredProduct:
- none
- virt
Network:
- ovn
OS:
- rhcos9
Owner:
- eng
- service-delivery
Platform:
- aws
- azure
- gcp
- metal
- rosa
- vsphere
Topology:
- ha
- microshift
- external
CGroupMode:
- v2
ContainerRuntime:
- runc
- crun
Upgrade:
- micro
- minor
- none
advanced_options:
minimum_failure: 3
confidence: 95
pity_factor: 5
ignore_missing: false
ignore_disruption: true
flake_as_failure: false
pass_rate_required_new_tests: 95
include_multi_release_analysis: true
key_test_names:
- "install should succeed: overall"
- "[sig-cluster-lifecycle] Cluster completes upgrade"
- "[Jira:\"Test Framework\"] there should not be mass test failures"
metrics:
enabled: true
regression_tracking:
enabled: true
prime_cache:
enabled: true
automate_jira:
enabled: false
- name: 4.22-hypershift-candidates
base_release:
release: "4.21"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ type fallbackTestQueryReleasesGeneratorCacheKey struct {
VariantDBGroupBy sets.String
// CRTimeRoundingFactor is used by GetReleaseDatesFromBigQuery
CRTimeRoundingFactor time.Duration
// KeyTestNames affects the BuildComponentReportQuery results via filtering logic
KeyTestNames []string
}

// getCacheKey creates a cache key using the generator properties that we want included for uniqueness in what
Expand All @@ -370,6 +372,7 @@ func (f *fallbackTestQueryReleasesGenerator) getCacheKey() fallbackTestQueryRele
BaseEnd: f.BaseEnd,
VariantDBGroupBy: f.ReqOptions.VariantOption.DBGroupBy,
CRTimeRoundingFactor: f.ReqOptions.CacheOption.CRTimeRoundingFactor,
KeyTestNames: f.ReqOptions.AdvancedOption.KeyTestNames,
}
}

Expand Down Expand Up @@ -516,8 +519,9 @@ type fallbackTestQueryGeneratorCacheKey struct {
BaseRelease string
BaseStart time.Time
BaseEnd time.Time
// IgnoreDisruption is the only field within AdvancedOption that is used here
// IgnoreDisruption and KeyTestNames are fields within AdvancedOption that affect the query
IgnoreDisruption bool
KeyTestNames []string
IncludeVariants map[string][]string
VariantDBGroupBy sets.String
// if we ever needed fallback on cross-compare views we should include fields for that,
Expand All @@ -533,6 +537,7 @@ func (f *fallbackTestQueryGenerator) getCacheKey() fallbackTestQueryGeneratorCac
BaseStart: f.BaseStart,
BaseEnd: f.BaseEnd,
IgnoreDisruption: f.ReqOptions.AdvancedOption.IgnoreDisruption,
KeyTestNames: f.ReqOptions.AdvancedOption.KeyTestNames,
IncludeVariants: f.ReqOptions.VariantOption.IncludeVariants,
VariantDBGroupBy: f.ReqOptions.VariantOption.DBGroupBy,
}
Expand All @@ -541,7 +546,7 @@ func (f *fallbackTestQueryGenerator) getCacheKey() fallbackTestQueryGeneratorCac
func (f *fallbackTestQueryGenerator) getTestFallbackRelease(ctx context.Context) (bq.ReportTestStatus, []error) {
commonQuery, groupByQuery, queryParameters := query.BuildComponentReportQuery(
f.client, f.ReqOptions, f.allVariants, f.ReqOptions.VariantOption.IncludeVariants,
query.DefaultJunitTable, false)
query.DefaultJunitTable, false, f.ReqOptions.AdvancedOption.KeyTestNames...)
before := time.Now()
log.Infof("Starting Fallback (%s) QueryTestStatus", f.BaseRelease)
errs := []error{}
Expand Down
124 changes: 100 additions & 24 deletions pkg/api/componentreadiness/query/querygenerators.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,8 @@ func NewBaseQueryGenerator(

func (b *baseQueryGenerator) QueryTestStatus(ctx context.Context) (bq.ReportTestStatus, []error) {

commonQuery, groupByQuery, queryParameters := BuildComponentReportQuery(b.client, b.ReqOptions, b.allVariants, b.ReqOptions.VariantOption.IncludeVariants, DefaultJunitTable, false)
commonQuery, groupByQuery, queryParameters := BuildComponentReportQuery(b.client, b.ReqOptions, b.allVariants, b.ReqOptions.VariantOption.IncludeVariants, DefaultJunitTable, false, b.ReqOptions.AdvancedOption.KeyTestNames...)

before := time.Now()
errs := []error{}
baseString := commonQuery + ` AND jv_Release.variant_value = @BaseRelease`
baseQuery := b.client.Query(ctx, bqlabel.CRJunitBase, baseString+groupByQuery)
Expand All @@ -164,8 +163,6 @@ func (b *baseQueryGenerator) QueryTestStatus(ctx context.Context) (bq.ReportTest
errs = append(errs, baseErrs...)
}

log.Infof("Base QueryTestStatus completed in %s with %d base results from db", time.Since(before), len(baseStatus))

return bq.ReportTestStatus{BaseStatus: baseStatus}, errs
}

Expand Down Expand Up @@ -207,9 +204,8 @@ func NewSampleQueryGenerator(
}

func (s *sampleQueryGenerator) QueryTestStatus(ctx context.Context) (bq.ReportTestStatus, []error) {
commonQuery, groupByQuery, queryParameters := BuildComponentReportQuery(s.client, s.ReqOptions, s.allVariants, s.IncludeVariants, s.JunitTable, true)
commonQuery, groupByQuery, queryParameters := BuildComponentReportQuery(s.client, s.ReqOptions, s.allVariants, s.IncludeVariants, s.JunitTable, true, s.ReqOptions.AdvancedOption.KeyTestNames...)

before := time.Now()
errs := []error{}
sampleString := commonQuery
// Only set sample release when PR and payload options are not set
Expand Down Expand Up @@ -273,19 +269,36 @@ func (s *sampleQueryGenerator) QueryTestStatus(ctx context.Context) (bq.ReportTe
errs = append(errs, sampleErrs...)
}

log.Infof("Sample QueryTestStatus completed in %s with %d sample results db", time.Since(before), len(sampleStatus))

return bq.ReportTestStatus{SampleStatus: sampleStatus}, errs
}

// buildPriorityCaseStatement generates a SQL CASE statement that assigns priority based on test position in the list.
// Lower index = higher priority. This is used to ensure when multiple key tests appear in the same job,
// only the highest priority one is counted.
func buildPriorityCaseStatement(keyTestNames []string) string {
var caseStatements []string
for i, testName := range keyTestNames {
// Escape single quotes in test name for SQL
escapedTestName := strings.ReplaceAll(testName, "'", "''")
caseStatements = append(caseStatements, fmt.Sprintf("WHEN test_name = '%s' THEN %d", escapedTestName, i))
}
// Add a default case to handle any unexpected tests (should not happen due to IN UNNEST filter)
caseStatements = append(caseStatements, fmt.Sprintf("ELSE %d", len(keyTestNames)))
return strings.Join(caseStatements, "\n\t\t\t\t\t\t\t")
}

// BuildComponentReportQuery returns the common query for the higher level summary component summary.
// If keyTestNames is provided, when any of these tests fail in a job, all other test failures
// in that job are excluded from regression analysis. Only the highest priority (earliest in the list)
// key test will be included for each affected job.
func BuildComponentReportQuery(
client *bqcachedclient.Client,
reqOptions reqopts.RequestOptions,
allJobVariants crtest.JobVariants,
includeVariants map[string][]string,
junitTable string,
isSample bool,
keyTestNames ...string,
) (string, string, []bigquery.QueryParameter) {
// Parts of the query, including the columns returned, are dynamic, based on the list of variants we're told to work with.
// Variants will be returned as columns with names like: variant_[VariantName]
Expand All @@ -294,7 +307,7 @@ func BuildComponentReportQuery(
joinVariants := ""
groupByVariants := ""
for _, v := range sortedKeys(allJobVariants.Variants) {
joinVariants += fmt.Sprintf("LEFT JOIN %s.job_variants jv_%s ON variant_registry_job_name = jv_%s.job_name AND jv_%s.variant_name = '%s'\n",
joinVariants += fmt.Sprintf("LEFT JOIN %s.job_variants jv_%s ON junit_data.variant_registry_job_name = jv_%s.job_name AND jv_%s.variant_name = '%s'\n",
client.Dataset, v, v, v, v)
}
for _, v := range reqOptions.VariantOption.DBGroupBy.List() {
Expand All @@ -313,33 +326,95 @@ func BuildComponentReportQuery(
// TODO: last_failure here explicitly uses success_val not adjusted_success_val, this ensures we
// show the last time the test failed, not flaked. if you enable the flakes as failures feature (which is
// non default today), the last failure time will be wrong which can impact things like failed fix detection.
queryString := fmt.Sprintf(`WITH latest_component_mapping AS (
SELECT *
FROM %s.component_mapping cm
WHERE created_at = (
SELECT MAX(created_at)
FROM %s.component_mapping))
// Build the WITH clause - add jobs_with_failed_key_tests CTE if keyTestNames is provided
withClause := ""
if len(keyTestNames) > 0 {
// Create a CTE that identifies the highest priority (lowest index) key test in each job
// This ensures when multiple key tests appear in the same job, only the highest priority one is used
withClause = fmt.Sprintf(`WITH key_test_priorities AS (
SELECT
prowjob_build_id,
test_name,
-- Find the index/priority of each test (lower index = higher priority)
CASE
%s
END AS test_priority
FROM %s.%s AS junit
WHERE modified_time >= DATETIME(@From)
AND modified_time < DATETIME(@To)
AND test_name IN UNNEST(@KeyTestNames)
AND success_val = 0
),
Comment on lines +331 to +347
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

key_test_priorities may suppress jobs where a key test merely flaked.

The CTE filters success_val = 0 against the raw (non-deduped) table. For a key test that flaked, the raw table contains a failure row with success_val = 0, so the job would still be matched. The deduped table (dedupedJunitTable) resolves this by preferring the flake row, but the CTE bypasses that logic entirely.

If a key test flakes, should the remaining tests in that job be suppressed from regression analysis? If not, add AND flake_count = 0 to the CTE filter (or consider querying the deduped table instead):

💡 Suggested fix
 					FROM %s.%s AS junit
 					WHERE modified_time >= DATETIME(`@From`)
 					AND modified_time < DATETIME(`@To`)
 					AND test_name IN UNNEST(`@KeyTestNames`)
-					AND success_val = 0
+					AND success_val = 0
+					AND flake_count = 0
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/api/componentreadiness/query/querygenerators.go` around lines 332 - 348,
The CTE key_test_priorities currently filters raw junit rows with "success_val =
0" which will include flaked failures; update the WHERE clause inside the
key_test_priorities CTE in querygenerators.go to exclude flakes (add "AND
flake_count = 0") or switch the CTE to read from dedupedJunitTable instead of
the raw junit table so the dedup logic is applied (modify the CTE that selects
from "%s.%s AS junit" named in the WITH block and adjust parameters
accordingly).

jobs_with_highest_priority_test AS (
SELECT
prowjob_build_id,
test_name
FROM key_test_priorities
WHERE test_priority = (
SELECT MIN(test_priority)
FROM key_test_priorities ep2
WHERE ep2.prowjob_build_id = key_test_priorities.prowjob_build_id
)
),
latest_component_mapping AS (
SELECT *
FROM %s.component_mapping cm
WHERE created_at = (
SELECT MAX(created_at)
FROM %s.component_mapping))`,
buildPriorityCaseStatement(keyTestNames), client.Dataset, junitTable, client.Dataset, client.Dataset)
} else {
withClause = fmt.Sprintf(`WITH latest_component_mapping AS (
SELECT *
FROM %s.component_mapping cm
WHERE created_at = (
SELECT MAX(created_at)
FROM %s.component_mapping))`,
client.Dataset, client.Dataset)
}

queryString := fmt.Sprintf(`%s
SELECT
ANY_VALUE(test_name HAVING MAX prowjob_start) AS test_name,
ANY_VALUE(testsuite HAVING MAX prowjob_start) AS test_suite,
ANY_VALUE(junit_data.test_name HAVING MAX junit_data.prowjob_start) AS test_name,
ANY_VALUE(junit_data.testsuite HAVING MAX junit_data.prowjob_start) AS test_suite,
cm.id as test_id,
%s
COUNT(cm.id) AS total_count,
SUM(adjusted_success_val) AS success_count,
SUM(adjusted_flake_count) AS flake_count,
MAX(CASE WHEN success_val = 0 THEN prowjob_start ELSE NULL END) AS last_failure,
SUM(junit_data.adjusted_success_val) AS success_count,
SUM(junit_data.adjusted_flake_count) AS flake_count,
MAX(CASE WHEN junit_data.success_val = 0 THEN junit_data.prowjob_start ELSE NULL END) AS last_failure,
ANY_VALUE(cm.component) AS component,
ANY_VALUE(cm.capabilities) AS capabilities,
FROM (%s)
INNER JOIN latest_component_mapping cm ON testsuite = cm.suite AND test_name = cm.name
FROM (%s) AS junit_data
INNER JOIN latest_component_mapping cm ON junit_data.testsuite = cm.suite AND junit_data.test_name = cm.name
`,
client.Dataset, client.Dataset, selectVariants, fmt.Sprintf(dedupedJunitTable, jobNameQueryPortion, client.Dataset, junitTable, client.Dataset, client.Dataset, jobRunAnnotationToIgnore))
withClause, selectVariants, fmt.Sprintf(dedupedJunitTable, jobNameQueryPortion, client.Dataset, junitTable, client.Dataset, client.Dataset, jobRunAnnotationToIgnore))

queryString += joinVariants

queryString += `WHERE cm.staff_approved_obsolete = false AND
(variant_registry_job_name LIKE 'periodic-%%' OR variant_registry_job_name LIKE 'release-%%' OR variant_registry_job_name LIKE 'aggregator-%%')`
(junit_data.variant_registry_job_name LIKE 'periodic-%%' OR junit_data.variant_registry_job_name LIKE 'release-%%' OR junit_data.variant_registry_job_name LIKE 'aggregator-%%')`
commonParams := []bigquery.QueryParameter{}

// Add filtering logic for key tests with priority
// Only include the highest priority test from each job, and exclude all other tests from those jobs
if len(keyTestNames) > 0 {
queryString += `
AND (
-- Include tests from jobs that don't have any failed key tests
junit_data.prowjob_build_id NOT IN (SELECT prowjob_build_id FROM jobs_with_highest_priority_test)
-- Or include only the highest priority key test from jobs that have them
OR EXISTS (
SELECT 1 FROM jobs_with_highest_priority_test j
WHERE j.prowjob_build_id = junit_data.prowjob_build_id
AND j.test_name = junit_data.test_name
)
)`
commonParams = append(commonParams, bigquery.QueryParameter{
Name: "KeyTestNames",
Value: keyTestNames,
})
}
if reqOptions.AdvancedOption.IgnoreDisruption {
queryString += ` AND NOT 'Disruption' in UNNEST(capabilities)`
}
Expand Down Expand Up @@ -627,6 +702,7 @@ func FetchTestStatusResults(ctx context.Context, query *bigquery.Query) (map[str

status[testIDStr] = testStatus
}

return status, errs
}

Expand Down
Loading