mozilla · sean-rose · Nov 10, 2025 · Sep 5, 2025 · Sep 5, 2025 · Nov 7, 2025
@@ -56,9 +56,19 @@ def extract_table_references(sql: str) -> List[str]:
         creates |= {
             _raw_table_name(expr.this)
             for expr in statement.find_all(sqlglot.exp.Create)
+            if expr.kind in ("TABLE", "VIEW")
         }
         tables |= (
-            {_raw_table_name(table) for table in statement.find_all(sqlglot.exp.Table)}
+            {
+                _raw_table_name(table)
+                for table in statement.find_all(sqlglot.exp.Table)
+                # Starting in sqlglot v26.9.0 the identifiers for UDFs created by `CREATE TEMP FUNCTION`
+                # statements get parsed into `Table` expressions (https://github.com/tobymao/sqlglot/pull/4829),
+                # so we need to exclude those (note that `Table` expressions in the bodies of UDFs won't have
+                # the `UserDefinedFunction` expression as their parent).  We reported this as a sqlglot bug
+                # (https://github.com/tobymao/sqlglot/issues/6298) but were told this is expected behavior.
+                if not isinstance(table.parent, sqlglot.exp.UserDefinedFunction)
+            }
             # ignore references created in this query
             - creates
             # ignore CTEs created in this statement

@@ -185,6 +185,22 @@ def qualify_table_references_in_file(path: Path) -> str:
             "Cannot qualify table_references of query scripts or UDFs"
         )
 
+    # INFORMATION_SCHEMA views that can be dataset qualified
+    # https://cloud.google.com/bigquery/docs/information-schema-intro#dataset_qualifier
+    # used to make a best-effort attempt to qualify with dataset when appropriate
+    DATASET_QUALIFIED_INFO_SCHEMA_VIEWS = (
+        "COLUMNS",
+        "COLUMN_FIELD_PATHS",
+        "MATERIALIZED_VIEWS",
+        "PARAMETERS",
+        "PARTITIONS",
+        "ROUTINES",
+        "ROUTINE_OPTIONS",
+        "TABLES",
+        "TABLE_OPTIONS",
+        "VIEWS",
+    )
+
     # determine the default target project and dataset from the path
     target_project = Path(path).parent.parent.parent.name
     default_dataset = Path(path).parent.parent.name
@@ -232,7 +248,9 @@ def qualify_table_references_in_file(path: Path) -> str:
             for table_expr in statement.find_all(sqlglot.exp.Table):
                 # existing table ref including backticks without alias
                 table_expr.set("alias", "")
-                reference_string = table_expr.sql(dialect="bigquery")
+                # as of sqlglot 26, dialect=bigquery puts backticks at only start and end
+                # if every part is quoted, so using dialect=None
+                reference_string = table_expr.sql(dialect=None).replace('"', "`?")
 
                 matched_cte = [
                     re.match(
@@ -244,17 +262,28 @@ def qualify_table_references_in_file(path: Path) -> str:
                 if any(matched_cte):
                     continue
 
-                # project id is parsed as the catalog attribute
-                # but information_schema region may also be parsed as catalog
-                if table_expr.catalog.startswith("region-"):
-                    project_name = f"{target_project}.{table_expr.catalog}"
+                # as of sqlglot 26, INFORMATION_SCHEMA.{VIEW_NAME} is parsed together as a table name
+                if table_expr.name.startswith("INFORMATION_SCHEMA."):
+                    if table_expr.db.lower().startswith("region-"):
+                        project_name = table_expr.catalog or target_project
+                        dataset_name = table_expr.db
+                    elif (
+                        table_expr.name.split(".")[1]
+                        in DATASET_QUALIFIED_INFO_SCHEMA_VIEWS
+                    ):  # add project and dataset
+                        project_name = target_project
+                        dataset_name = table_expr.db or default_dataset
+                    else:  # just add project
+                        project_name = ""
+                        dataset_name = table_expr.db or target_project
                 elif table_expr.catalog == "":  # no project id
                     project_name = target_project
+                    dataset_name = table_expr.db or default_dataset
                 else:  # project id exists
                     continue
 
                 # fully qualified table ref
-                replacement_string = f"`{project_name}.{table_expr.db or default_dataset}.{table_expr.name}`"
+                replacement_string = f"`{project_name}{'.' if project_name else ''}{dataset_name}.{table_expr.name}`"
 
                 table_replacements.add((reference_string, replacement_string))
 

@@ -260,7 +260,7 @@ deprecation:
 
 format:
   skip:
-  - bigquery_etl/glam/templates/*.sql
+  - bigquery_etl/glam/templates/**/*.sql
   - sql_generators/**/*.sql
   - tests/**/*.sql
   - sql/moz-fx-data-shared-prod/udf_js/jackknife_percentile_ci/udf.sql
@@ -271,8 +271,10 @@ format:
   - sql/moz-fx-data-shared-prod/udf_legacy/to_iso8601.sql
   - stored_procedures/safe_crc32_uuid.sql
   skip_qualifying_references:
-  - sql/**/checks.sql
   - sql/mozfun/**/examples/*.sql
+  # Don't try to qualify references when Jinja expressions are being used for table/view IDs.
+  - sql/**/bigeye_custom_rules.sql
+  - sql/**/checks.sql
 
 routine:
   dependency_dir: udf_js_lib/

@@ -43,7 +43,7 @@ pytest==8.4.2
 PyYAML==6.0.3
 rich-click==1.9.4
 smart_open==6.4.0
-sqlglot==25.28.0
+sqlglot==27.29.0
 sqlparse==0.5.3
 stripe==6.4.0
 symbolic==12.16.3

@@ -899,9 +899,7 @@ jaraco-classes==3.4.0 \
 jeepney==0.8.0 \
     --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \
     --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755
-    # via
-    #   keyring
-    #   secretstorage
+    # via secretstorage
 jinja2==3.1.6 \
     --hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
     --hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
@@ -2346,9 +2344,7 @@ s3transfer==0.13.0 \
 secretstorage==3.3.3 \
     --hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \
     --hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99
-    # via
-    #   bigeye-sdk
-    #   keyring
+    # via bigeye-sdk
 shellingham==1.5.4 \
     --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \
     --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de
@@ -2389,9 +2385,9 @@ soupsieve==2.6 \
     --hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
     --hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
     # via beautifulsoup4
-sqlglot==25.28.0 \
-    --hash=sha256:01a6b76dd916c2ce39a5f808559e337600ea261873d25be0dc7f99da3d24860d \
-    --hash=sha256:1bfcadada28bcab873382b271ed30138597cbf8893d90cc259563f9498bd1b49
+sqlglot==27.29.0 \
+    --hash=sha256:2270899694663acef94fa93497971837e6fadd712f4a98b32aee1e980bc82722 \
+    --hash=sha256:9a5ea8ac61826a7763de10cad45a35f0aa9bfcf7b96ee74afb2314de9089e1cb
     # via -r requirements.in
 sqlparse==0.5.3 \
     --hash=sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272 \

@@ -24,3 +24,30 @@ def test_extract_table_refs_pivot_and_join(self):
         refs = extract_table_references(pivot_query)
 
         assert set(refs) == {"Produce", "Perishable_Mints"}
+
+    def test_extract_table_refs_with_ctes(self):
+        sql = """
+          WITH foo AS (SELECT * FROM bar)
+          SELECT * FROM foo
+        """
+        refs = extract_table_references(sql)
+
+        assert refs == ["bar"]
+
+    def test_extract_table_refs_with_temp_udfs(self):
+        sql = """
+          CREATE TEMP FUNCTION foo() AS ((SELECT MAX(foo) FROM bar));
+          SELECT foo()
+        """
+        refs = extract_table_references(sql)
+
+        assert refs == ["bar"]
+
+    def test_extract_table_refs_with_temp_tables(self):
+        sql = """
+          CREATE TEMP TABLE foo AS SELECT * FROM bar;
+          SELECT * FROM foo
+        """
+        refs = extract_table_references(sql)
+
+        assert refs == ["bar"]
@@ -186,10 +186,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
                     SELECT
                         STRUCT(key, CAST(TRUE AS INT64) AS value)
                     FROM
-                        data.array_field AS key
+                        _d.array_field AS key
                 )
             ) AS array_field
-        FROM data
+        FROM data AS _d
         """
         query_path = tmp_path / "project" / "dataset" / "test"
         query_path.mkdir(parents=True, exist_ok=True)
@@ -207,10 +207,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
                     SELECT
                         STRUCT(key, CAST(TRUE AS INT64) AS value)
                     FROM
-                        data.array_field AS key
+                        _d.array_field AS key
                 )
             ) AS array_field
-        FROM data
+        FROM data AS _d
         """
         assert result == expected
 
@@ -327,14 +327,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
         WITH info_schema_region AS (
             SELECT 1 FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
         ),
+        info_schema_region_quoted AS (
+            SELECT 1 FROM `region-us`.`INFORMATION_SCHEMA.JOBS_BY_PROJECT`
+        ),
+        info_schema_region_full_quoted AS (
+            SELECT 1 FROM `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
+        ),
         info_schema_project AS (
-            SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+            SELECT 1 FROM proj.INFORMATION_SCHEMA.SCHEMATA
         ),
         info_schema_full AS (
             SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
         ),
         info_schema_none AS (
             SELECT * FROM INFORMATION_SCHEMA.SCHEMATA
+        ),
+        info_schema_none_ds_qualify AS (
+            SELECT * FROM INFORMATION_SCHEMA.TABLES
         )
         SELECT 1
         """
@@ -346,14 +355,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
         WITH info_schema_region AS (
             SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
         ),
+        info_schema_region_quoted AS (
+            SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
+        ),
+        info_schema_region_full_quoted AS (
+            SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
+        ),
         info_schema_project AS (
-            SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+            SELECT 1 FROM `proj.INFORMATION_SCHEMA.SCHEMATA`
         ),
         info_schema_full AS (
             SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
         ),
         info_schema_none AS (
             SELECT * FROM `{default_project}.INFORMATION_SCHEMA.SCHEMATA`
+        ),
+        info_schema_none_ds_qualify AS (
+            SELECT * FROM `{default_project}.{default_dataset}.INFORMATION_SCHEMA.TABLES`
         )
         SELECT 1
         """