Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion bigquery_etl/dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,19 @@ def extract_table_references(sql: str) -> List[str]:
creates |= {
_raw_table_name(expr.this)
for expr in statement.find_all(sqlglot.exp.Create)
if expr.kind in ("TABLE", "VIEW")
}
tables |= (
{_raw_table_name(table) for table in statement.find_all(sqlglot.exp.Table)}
{
_raw_table_name(table)
for table in statement.find_all(sqlglot.exp.Table)
# Starting in sqlglot v26.9.0 the identifiers for UDFs created by `CREATE TEMP FUNCTION`
# statements get parsed into `Table` expressions (https://github.com/tobymao/sqlglot/pull/4829),
# so we need to exclude those (note that `Table` expressions in the bodies of UDFs won't have
# the `UserDefinedFunction` expression as their parent). We reported this as a sqlglot bug
# (https://github.com/tobymao/sqlglot/issues/6298) but were told this is expected behavior.
if not isinstance(table.parent, sqlglot.exp.UserDefinedFunction)
}
# ignore references created in this query
- creates
# ignore CTEs created in this statement
Expand Down
41 changes: 35 additions & 6 deletions bigquery_etl/util/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,22 @@ def qualify_table_references_in_file(path: Path) -> str:
"Cannot qualify table_references of query scripts or UDFs"
)

# INFORMATION_SCHEMA views that can be dataset qualified
# https://cloud.google.com/bigquery/docs/information-schema-intro#dataset_qualifier
# used to make a best-effort attempt to qualify with dataset when appropriate
DATASET_QUALIFIED_INFO_SCHEMA_VIEWS = (
"COLUMNS",
"COLUMN_FIELD_PATHS",
"MATERIALIZED_VIEWS",
"PARAMETERS",
"PARTITIONS",
"ROUTINES",
"ROUTINE_OPTIONS",
"TABLES",
"TABLE_OPTIONS",
"VIEWS",
)

# determine the default target project and dataset from the path
target_project = Path(path).parent.parent.parent.name
default_dataset = Path(path).parent.parent.name
Expand Down Expand Up @@ -232,7 +248,9 @@ def qualify_table_references_in_file(path: Path) -> str:
for table_expr in statement.find_all(sqlglot.exp.Table):
# existing table ref including backticks without alias
table_expr.set("alias", "")
reference_string = table_expr.sql(dialect="bigquery")
# as of sqlglot 26, dialect=bigquery puts backticks at only start and end
# if every part is quoted, so using dialect=None
reference_string = table_expr.sql(dialect=None).replace('"', "`?")

matched_cte = [
re.match(
Expand All @@ -244,17 +262,28 @@ def qualify_table_references_in_file(path: Path) -> str:
if any(matched_cte):
continue

# project id is parsed as the catalog attribute
# but information_schema region may also be parsed as catalog
if table_expr.catalog.startswith("region-"):
project_name = f"{target_project}.{table_expr.catalog}"
# as of sqlglot 26, INFORMATION_SCHEMA.{VIEW_NAME} is parsed together as a table name
if table_expr.name.startswith("INFORMATION_SCHEMA."):
if table_expr.db.lower().startswith("region-"):
project_name = table_expr.catalog or target_project
dataset_name = table_expr.db
elif (
table_expr.name.split(".")[1]
in DATASET_QUALIFIED_INFO_SCHEMA_VIEWS
): # add project and dataset
project_name = target_project
dataset_name = table_expr.db or default_dataset
else: # just add project
project_name = ""
dataset_name = table_expr.db or target_project
elif table_expr.catalog == "": # no project id
project_name = target_project
dataset_name = table_expr.db or default_dataset
else: # project id exists
continue

# fully qualified table ref
replacement_string = f"`{project_name}.{table_expr.db or default_dataset}.{table_expr.name}`"
replacement_string = f"`{project_name}{'.' if project_name else ''}{dataset_name}.{table_expr.name}`"

table_replacements.add((reference_string, replacement_string))

Expand Down
6 changes: 4 additions & 2 deletions bqetl_project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ deprecation:

format:
skip:
- bigquery_etl/glam/templates/*.sql
- bigquery_etl/glam/templates/**/*.sql
- sql_generators/**/*.sql
- tests/**/*.sql
- sql/moz-fx-data-shared-prod/udf_js/jackknife_percentile_ci/udf.sql
Expand All @@ -271,8 +271,10 @@ format:
- sql/moz-fx-data-shared-prod/udf_legacy/to_iso8601.sql
- stored_procedures/safe_crc32_uuid.sql
skip_qualifying_references:
- sql/**/checks.sql
- sql/mozfun/**/examples/*.sql
# Don't try to qualify references when Jinja expressions are being used for table/view IDs.
- sql/**/bigeye_custom_rules.sql
- sql/**/checks.sql

routine:
dependency_dir: udf_js_lib/
Expand Down
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pytest==8.4.2
PyYAML==6.0.3
rich-click==1.9.4
smart_open==6.4.0
sqlglot==25.28.0
sqlglot==27.29.0
sqlparse==0.5.3
stripe==6.4.0
symbolic==12.16.3
Expand Down
14 changes: 5 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -899,9 +899,7 @@ jaraco-classes==3.4.0 \
jeepney==0.8.0 \
--hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \
--hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755
# via
# keyring
# secretstorage
# via secretstorage
jinja2==3.1.6 \
--hash=sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d \
--hash=sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67
Expand Down Expand Up @@ -2346,9 +2344,7 @@ s3transfer==0.13.0 \
secretstorage==3.3.3 \
--hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \
--hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99
# via
# bigeye-sdk
# keyring
# via bigeye-sdk
shellingham==1.5.4 \
--hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \
--hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de
Expand Down Expand Up @@ -2389,9 +2385,9 @@ soupsieve==2.6 \
--hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
--hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
# via beautifulsoup4
sqlglot==25.28.0 \
--hash=sha256:01a6b76dd916c2ce39a5f808559e337600ea261873d25be0dc7f99da3d24860d \
--hash=sha256:1bfcadada28bcab873382b271ed30138597cbf8893d90cc259563f9498bd1b49
sqlglot==27.29.0 \
--hash=sha256:2270899694663acef94fa93497971837e6fadd712f4a98b32aee1e980bc82722 \
--hash=sha256:9a5ea8ac61826a7763de10cad45a35f0aa9bfcf7b96ee74afb2314de9089e1cb
# via -r requirements.in
sqlparse==0.5.3 \
--hash=sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272 \
Expand Down
27 changes: 27 additions & 0 deletions tests/test_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,30 @@ def test_extract_table_refs_pivot_and_join(self):
refs = extract_table_references(pivot_query)

assert set(refs) == {"Produce", "Perishable_Mints"}

def test_extract_table_refs_with_ctes(self):
sql = """
WITH foo AS (SELECT * FROM bar)
SELECT * FROM foo
"""
refs = extract_table_references(sql)

assert refs == ["bar"]

def test_extract_table_refs_with_temp_udfs(self):
sql = """
CREATE TEMP FUNCTION foo() AS ((SELECT MAX(foo) FROM bar));
SELECT foo()
"""
refs = extract_table_references(sql)

assert refs == ["bar"]

def test_extract_table_refs_with_temp_tables(self):
sql = """
CREATE TEMP TABLE foo AS SELECT * FROM bar;
SELECT * FROM foo
"""
refs = extract_table_references(sql)

assert refs == ["bar"]
30 changes: 24 additions & 6 deletions tests/util/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
SELECT
STRUCT(key, CAST(TRUE AS INT64) AS value)
FROM
data.array_field AS key
_d.array_field AS key
)
) AS array_field
FROM data
FROM data AS _d
"""
query_path = tmp_path / "project" / "dataset" / "test"
query_path.mkdir(parents=True, exist_ok=True)
Expand All @@ -207,10 +207,10 @@ def test_qualify_table_references_in_file_array_fields(self, tmp_path):
SELECT
STRUCT(key, CAST(TRUE AS INT64) AS value)
FROM
data.array_field AS key
_d.array_field AS key
)
) AS array_field
FROM data
FROM data AS _d
"""
assert result == expected

Expand Down Expand Up @@ -327,14 +327,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
WITH info_schema_region AS (
SELECT 1 FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
),
info_schema_region_quoted AS (
SELECT 1 FROM `region-us`.`INFORMATION_SCHEMA.JOBS_BY_PROJECT`
),
info_schema_region_full_quoted AS (
SELECT 1 FROM `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
),
info_schema_project AS (
SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
SELECT 1 FROM proj.INFORMATION_SCHEMA.SCHEMATA
),
info_schema_full AS (
SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
),
info_schema_none AS (
SELECT * FROM INFORMATION_SCHEMA.SCHEMATA
),
info_schema_none_ds_qualify AS (
SELECT * FROM INFORMATION_SCHEMA.TABLES
)
SELECT 1
"""
Expand All @@ -346,14 +355,23 @@ def test_qualify_table_references_information_schema(self, tmp_path):
WITH info_schema_region AS (
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
),
info_schema_region_quoted AS (
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
),
info_schema_region_full_quoted AS (
SELECT 1 FROM `{default_project}.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
),
info_schema_project AS (
SELECT 1 FROM proj.INFORMATION_SCHEMA.JOBS_BY_PROJECT
SELECT 1 FROM `proj.INFORMATION_SCHEMA.SCHEMATA`
),
info_schema_full AS (
SELECT * FROM `project.region-us.INFORMATION_SCHEMA.SCHEMATA`
),
info_schema_none AS (
SELECT * FROM `{default_project}.INFORMATION_SCHEMA.SCHEMATA`
),
info_schema_none_ds_qualify AS (
SELECT * FROM `{default_project}.{default_dataset}.INFORMATION_SCHEMA.TABLES`
)
SELECT 1
"""
Expand Down