Skip to content

Commit 073bdac

Browse files
sean-roseBenWu
andauthored
fix(format): Update qualify_table_references_in_file() to work with newer sqlglot versions (DENG-7005) (#8411)
* [DENG-7005] Update qualify_table_references to work with newer sqlglot * up sqlglot * sqlglot==27.29.0 * fix(format): Correct the `format.skip` config for GLAM templates to account for subdirectories now being used. * fix(format): Skip qualifying table references in `bigeye_custom_rules.sql` files. * fix(dependency): Implement workaround for tobymao/sqlglot#6298. * fix(dependency): Only consider `CREATE [TEMP] TABLE` and `CREATE VIEW` statements for ignoring table references created in the query itself. * test(dependency): Add `extract_table_references()` tests for SQL with CTEs and UDFs. * test(dependency): Add `extract_table_references()` test for SQL with temp tables. --------- Co-authored-by: bwu <[email protected]>
1 parent c9ecdc2 commit 073bdac

File tree

7 files changed

+107
-25
lines changed

7 files changed

+107
-25
lines changed

bigquery_etl/dependency.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,19 @@ def extract_table_references(sql: str) -> List[str]:
5656
creates |= {
5757
_raw_table_name(expr.this)
5858
for expr in statement.find_all(sqlglot.exp.Create)
59+
if expr.kind in ("TABLE", "VIEW")
5960
}
6061
tables |= (
61-
{_raw_table_name(table) for table in statement.find_all(sqlglot.exp.Table)}
62+
{
63+
_raw_table_name(table)
64+
for table in statement.find_all(sqlglot.exp.Table)
65+
# Starting in sqlglot v26.9.0 the identifiers for UDFs created by `CREATE TEMP FUNCTION`
66+
# statements get parsed into `Table` expressions (https://github.com/tobymao/sqlglot/pull/4829),
67+
# so we need to exclude those (note that `Table` expressions in the bodies of UDFs won't have
68+
# the `UserDefinedFunction` expression as their parent). We reported this as a sqlglot bug
69+
# (https://github.com/tobymao/sqlglot/issues/6298) but were told this is expected behavior.
70+
if not isinstance(table.parent, sqlglot.exp.UserDefinedFunction)
71+
}
6272
# ignore references created in this query
6373
- creates
6474
# ignore CTEs created in this statement

bigquery_etl/util/common.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,22 @@ def qualify_table_references_in_file(path: Path) -> str:
185185
"Cannot qualify table_references of query scripts or UDFs"
186186
)
187187

188+
# INFORMATION_SCHEMA views that can be dataset qualified
189+
# https://cloud.google.com/bigquery/docs/information-schema-intro#dataset_qualifier
190+
# used to make a best-effort attempt to qualify with dataset when appropriate
191+
DATASET_QUALIFIED_INFO_SCHEMA_VIEWS = (
192+
"COLUMNS",
193+
"COLUMN_FIELD_PATHS",
194+
"MATERIALIZED_VIEWS",
195+
"PARAMETERS",
196+
"PARTITIONS",
197+
"ROUTINES",
198+
"ROUTINE_OPTIONS",
199+
"TABLES",
200+
"TABLE_OPTIONS",
201+
"VIEWS",
202+
)
203+
188204
# determine the default target project and dataset from the path
189205
target_project = Path(path).parent.parent.parent.name
190206
default_dataset = Path(path).parent.parent.name
@@ -232,7 +248,9 @@ def qualify_table_references_in_file(path: Path) -> str:
232248
for table_expr in statement.find_all(sqlglot.exp.Table):
233249
# existing table ref including backticks without alias
234250
table_expr.set("alias", "")
235-
reference_string = table_expr.sql(dialect="bigquery")
251+
# as of sqlglot 26, dialect=bigquery puts backticks at only start and end
252+
# if every part is quoted, so using dialect=None
253+
reference_string = table_expr.sql(dialect=None).replace('"', "`?")
236254

237255
matched_cte = [
238256
re.match(
@@ -244,17 +262,28 @@ def qualify_table_references_in_file(path: Path) -> str:
244262
if any(matched_cte):
245263
continue
246264

247-
# project id is parsed as the catalog attribute
248-
# but information_schema region may also be parsed as catalog
249-
if table_expr.catalog.startswith("region-"):
250-
project_name = f"{target_project}.{table_expr.catalog}"
265+
# as of sqlglot 26, INFORMATION_SCHEMA.{VIEW_NAME} is parsed together as a table name
266+
if table_expr.name.startswith("INFORMATION_SCHEMA."):
267+
if table_expr.db.lower().startswith("region-"):
268+
project_name = table_expr.catalog or target_project
269+
dataset_name = table_expr.db
270+
elif (
271+
table_expr.name.split(".")[1]
272+
in DATASET_QUALIFIED_INFO_SCHEMA_VIEWS
273+
): # add project and dataset
274+
project_name = target_project
275+
dataset_name = table_expr.db or default_dataset
276+
else: # just add project
277+
project_name = ""
278+
dataset_name = table_expr.db or target_project
251279
elif table_expr.catalog == "": # no project id
252280
project_name = target_project
281+
dataset_name = table_expr.db or default_dataset
253282
else: # project id exists
254283
continue
255284

256285
# fully qualified table ref
257-
replacement_string = f"`{project_name}.{table_expr.db or default_dataset}.{table_expr.name}`"
286+
replacement_string = f"`{project_name}{'.' if project_name else ''}{dataset_name}.{table_expr.name}`"
258287

259288
table_replacements.add((reference_string, replacement_string))
260289

bqetl_project.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ deprecation:
260260

261261
format:
262262
skip:
263-
- bigquery_etl/glam/templates/*.sql
263+
- bigquery_etl/glam/templates/**/*.sql
264264
- sql_generators/**/*.sql
265265
- tests/**/*.sql
266266
- sql/moz-fx-data-shared-prod/udf_js/jackknife_percentile_ci/udf.sql
@@ -271,8 +271,10 @@ format:
271271
- sql/moz-fx-data-shared-prod/udf_legacy/to_iso8601.sql
272272
- stored_procedures/safe_crc32_uuid.sql
273273
skip_qualifying_references:
274-
- sql/**/checks.sql
275274
- sql/mozfun/**/examples/*.sql
275+
# Don't try to qualify references when Jinja expressions are being used for table/view IDs.
276+
- sql/**/bigeye_custom_rules.sql
277+
- sql/**/checks.sql
276278

277279
routine:
278280
dependency_dir: udf_js_lib/

requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ pytest==8.4.2
4343
PyYAML==6.0.3
4444
rich-click==1.9.4
4545
smart_open==6.4.0
46-
sqlglot==25.28.0
46+
sqlglot==27.29.0
4747
sqlparse==0.5.3
4848
stripe==6.4.0
4949
symbolic==12.16.3

requirements.txt

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -899,9 +899,7 @@ jaraco-classes==3.4.0 \
899899
jeepney==0.8.0 \
900900
--hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \
901901
--hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755
902-
# via
903-
# keyring
904-
# secretstorage
902+
# via secretstorage
905903
jinja2==3.1.6 \
906904
--hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
907905
--hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
@@ -2346,9 +2344,7 @@ s3transfer==0.13.0 \
23462344
secretstorage==3.3.3 \
23472345
--hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \
23482346
--hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99
2349-
# via
2350-
# bigeye-sdk
2351-
# keyring
2347+
# via bigeye-sdk
23522348
shellingham==1.5.4 \
23532349
--hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \
23542350
--hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de
@@ -2389,9 +2385,9 @@ soupsieve==2.6 \
23892385
--hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
23902386
--hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
23912387
# via beautifulsoup4
2392-
sqlglot==25.28.0 \
2393-
--hash=sha256:01a6b76dd916c2ce39a5f808559e337600ea261873d25be0dc7f99da3d24860d \
2394-
--hash=sha256:1bfcadada28bcab873382b271ed30138597cbf8893d90cc259563f9498bd1b49
2388+
sqlglot==27.29.0 \
2389+
--hash=sha256:2270899694663acef94fa93497971837e6fadd712f4a98b32aee1e980bc82722 \
2390+
--hash=sha256:9a5ea8ac61826a7763de10cad45a35f0aa9bfcf7b96ee74afb2314de9089e1cb
23952391
# via -r requirements.in
23962392
sqlparse==0.5.3 \
23972393
--hash=sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272 \

tests/test_dependency.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,30 @@ def test_extract_table_refs_pivot_and_join(self):
2424
refs = extract_table_references(pivot_query)
2525

2626
assert set(refs) == {"Produce", "Perishable_Mints"}
27+
28+
def test_extract_table_refs_with_ctes(self):
29+
sql = """
30+
WITH foo AS (SELECT * FROM bar)
31+
SELECT * FROM foo
32+
"""
33+
refs = extract_table_references(sql)
34+
35+
assert refs == ["bar"]
36+
37+
def test_extract_table_refs_with_temp_udfs(self):
38+
sql = """
39+
CREATE TEMP FUNCTION foo() AS ((SELECT MAX(foo) FROM bar));
40+
SELECT foo()
41+
"""
42+
refs = extract_table_references(sql)
43+
44+
assert refs == ["bar"]
45+
46+
def test_extract_table_refs_with_temp_tables(self):
47+
sql = """
48+
CREATE TEMP TABLE foo AS SELECT * FROM bar;
49+
SELECT * FROM foo
50+
"""
51+
refs = extract_table_references(sql)
52+
53+
assert refs == ["bar"]

tests/util/test_common.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -186,10 +186,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
186186
SELECT
187187
STRUCT(key, CAST(TRUE AS INT64) AS value)
188188
FROM
189-
data.array_field AS key
189+
_d.array_field AS key
190190
)
191191
) AS array_field
192-
FROM data
192+
FROM data AS _d
193193
"""
194194
query_path = tmp_path / "project" / "dataset" / "test"
195195
query_path.mkdir(parents=True, exist_ok=True)
@@ -207,10 +207,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
207207
SELECT
208208
STRUCT(key, CAST(TRUE AS INT64) AS value)
209209
FROM
210-
data.array_field AS key
210+
_d.array_field AS key
211211
)
212212
) AS array_field
213-
FROM data
213+
FROM data AS _d
214214
"""
215215
assert result == expected
216216

@@ -327,14 +327,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
327327
WITH info_schema_region AS (
328328
SELECT 1 FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
329329
),
330+
info_schema_region_quoted AS (
331+
SELECT 1 FROM `region-us`.`INFORMATION_SCHEMA.JOBS_BY_PROJECT`
332+
),
333+
info_schema_region_full_quoted AS (
334+
SELECT 1 FROM `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
335+
),
330336
info_schema_project AS (
331-
SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
337+
SELECT 1 FROM proj.INFORMATION_SCHEMA.SCHEMATA
332338
),
333339
info_schema_full AS (
334340
SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
335341
),
336342
info_schema_none AS (
337343
SELECT * FROM INFORMATION_SCHEMA.SCHEMATA
344+
),
345+
info_schema_none_ds_qualify AS (
346+
SELECT * FROM INFORMATION_SCHEMA.TABLES
338347
)
339348
SELECT 1
340349
"""
@@ -346,14 +355,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
346355
WITH info_schema_region AS (
347356
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
348357
),
358+
info_schema_region_quoted AS (
359+
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
360+
),
361+
info_schema_region_full_quoted AS (
362+
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
363+
),
349364
info_schema_project AS (
350-
SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
365+
SELECT 1 FROM `proj.INFORMATION_SCHEMA.SCHEMATA`
351366
),
352367
info_schema_full AS (
353368
SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
354369
),
355370
info_schema_none AS (
356371
SELECT * FROM `{default_project}.INFORMATION_SCHEMA.SCHEMATA`
372+
),
373+
info_schema_none_ds_qualify AS (
374+
SELECT * FROM `{default_project}.{default_dataset}.INFORMATION_SCHEMA.TABLES`
357375
)
358376
SELECT 1
359377
"""

0 commit comments

Comments
 (0)