Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions sql/2024/privacy/number_of_websites_with_related_origin_trials.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# Pages that participate in the privacy-related origin trials

# Base64-decodes an origin trial token and returns the bytes from position 70
# onward as a string. The SAFE. prefix yields NULL instead of an error when the
# token is not valid base64.
# NOTE(review): kept for callers in the part of the file not visible here;
# PARSE_ORIGIN_TRIAL below no longer depends on it.
CREATE TEMP FUNCTION `DECODE_ORIGIN_TRIAL`(token STRING) RETURNS STRING DETERMINISTIC AS (
  SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70))
);

# Decodes an origin trial token once and extracts its JSON fields.
#   token:          the decoded payload string
#   origin/feature: `$.origin` and `$.feature` of the payload
#   expiry:         `$.expiry` interpreted as seconds since the Unix epoch
#   is_subdomain / is_third_party: TRUE only when the JSON value is `true`
CREATE TEMP FUNCTION `PARSE_ORIGIN_TRIAL`(token STRING)
RETURNS STRUCT<
  token STRING,
  origin STRING,
  feature STRING,
  expiry TIMESTAMP,
  is_subdomain BOOL,
  is_third_party BOOL
>
DETERMINISTIC AS (
  (
    -- Decode once in a CTE so every field below reads the same value instead
    -- of repeating the base64 decoding per field.
    WITH decoded_token AS (
      SELECT SAFE_CONVERT_BYTES_TO_STRING(SUBSTR(SAFE.FROM_BASE64(token), 70)) AS decoded
    )

    SELECT
      STRUCT(
        decoded AS token,
        JSON_VALUE(decoded, '$.origin') AS origin,
        JSON_VALUE(decoded, '$.feature') AS feature,
        TIMESTAMP_SECONDS(CAST(JSON_VALUE(decoded, '$.expiry') AS INT64)) AS expiry,
        JSON_VALUE(decoded, '$.isSubdomain') = 'true' AS is_subdomain,
        JSON_VALUE(decoded, '$.isThirdParty') = 'true' AS is_third_party
      )
    FROM decoded_token
  )
);

Expand Down
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_first_party_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites.

WITH pages AS (
  -- Every crawled page for the date, carrying the per-client number of
  -- distinct root-page hosts that is used as the percentage denominator.
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom metric, with the cookie's
  -- host and the host of the root page it was observed on.
  SELECT
    client,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  SAFE.STRING(cookie.name) AS cookie_name
FROM cookies
-- First-party cookie: the page host ends with the cookie host.
-- ENDS_WITH compares the suffix literally; a LIKE pattern built from the
-- cookie host would treat `_`, `%` and `\` inside the host as pattern syntax.
-- Rows with a NULL cookie host are still excluded, as before.
-- NOTE(review): this is a plain suffix match, it does not require the match
-- to start at a domain-label boundary.
WHERE ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC,
  cookie_name  -- tie-breaker so LIMIT returns a deterministic set
LIMIT 500
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_third_party_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common third-party cookie hosts, by number of pages on which they set cookies.

WITH pages AS (
  -- Every crawled page for the date, carrying the per-client number of
  -- distinct pages that is used as the percentage denominator.
  SELECT
    page,
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom metric, with the cookie's
  -- host and the host of the root page it was observed on.
  SELECT
    client,
    page,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_pages
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  cookie_host,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM cookies
-- Third-party cookie: the page host does NOT end with the cookie host.
-- ENDS_WITH compares the suffix literally; a LIKE pattern built from the
-- cookie host would treat `_`, `%` and `\` inside the host as pattern syntax.
-- Rows with a NULL cookie host are still excluded, as before.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_host
ORDER BY
  page_count DESC,
  client,
  cookie_host  -- tie-breaker so LIMIT returns a deterministic set
LIMIT 500
37 changes: 37 additions & 0 deletions sql/2025/privacy/cookies_top_third_party_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that set cookies using many domains.

WITH pages AS (
  -- Every crawled page for the date, carrying the per-client number of
  -- distinct root-page hosts that is used as the percentage denominator.
  SELECT
    client,
    root_page,
    custom_metrics,
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom metric, with the cookie's
  -- host and the host of the root page it was observed on.
  SELECT
    client,
    cookie,
    NET.HOST(SAFE.STRING(cookie.domain)) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics.cookies)) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  SAFE.STRING(cookie.name) AS cookie_name
FROM cookies
-- Third-party cookie: the page host does NOT end with the cookie host.
-- ENDS_WITH compares the suffix literally; a LIKE pattern built from the
-- cookie host would treat `_`, `%` and `\` inside the host as pattern syntax.
-- Rows with a NULL cookie host are still excluded, as before.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC,
  cookie_name  -- tie-breaker so LIMIT returns a deterministic set
LIMIT 500
90 changes: 90 additions & 0 deletions sql/2025/privacy/most_common_bounce_domains.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
-- Most common bounce domains: registrable domains that a root page first
-- redirects to (cross-site) and that then issue a second redirect.
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md

WITH redirect_requests AS (
  -- 3xx responses among the requests with index 1 or 2 of each root page,
  -- ignoring static sub-resource types.
  SELECT
    client,
    url,
    index,
    response_headers,
    page
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2025-07-01' AND
    is_root_page = TRUE AND
    type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
    -- Integer division selects exactly 300-399. ROUND(status / 100) = 3 would
    -- select 250-349 instead. SAFE. turns a non-numeric status into NULL
    -- (row skipped) rather than failing the whole query.
    DIV(SAFE.INT64(summary.status), 100) = 3 AND
    index <= 2
),

navigation_redirect AS (
  -- Find the first navigation redirect: request 1 redirects to a different
  -- registrable domain than the page itself.
  SELECT
    client,
    url,
    page,
    response_header.value AS navigation_redirect_location
  FROM redirect_requests
  CROSS JOIN UNNEST(response_headers) AS response_header
  WHERE
    index = 1 AND
    LOWER(response_header.name) = 'location' AND
    NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
),

bounce_redirect AS (
  -- Find the second navigation redirect: request 2 also carries a Location header.
  SELECT
    client,
    url,
    page,
    response_header.value AS bounce_redirect_location
  FROM redirect_requests
  CROSS JOIN UNNEST(response_headers) AS response_header
  WHERE
    index = 2 AND
    LOWER(response_header.name) = 'location'
),

bounce_sequences AS (
  -- Combine the first and second navigation redirects: the second request's
  -- URL must be exactly the Location the first redirect pointed to.
  SELECT
    nav.client,
    NET.REG_DOMAIN(nav.navigation_redirect_location) AS bounce_hostname,
    COUNT(DISTINCT nav.page) AS number_of_pages
  FROM navigation_redirect AS nav
  -- Only pages that have a matching second redirect are kept, so this is an
  -- inner join (previously a LEFT JOIN whose unmatched rows were filtered out).
  INNER JOIN bounce_redirect AS bounce
    ON
      nav.client = bounce.client AND
      nav.page = bounce.page AND
      nav.navigation_redirect_location = bounce.url
  WHERE bounce.bounce_redirect_location IS NOT NULL
  GROUP BY
    nav.client,
    bounce_hostname
),

pages_total AS (
  -- Denominator: distinct root pages per client.
  SELECT
    client,
    COUNT(DISTINCT page) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
  GROUP BY client
)

-- Count the number of websites with bounce tracking per bounce hostname
SELECT
  client,
  bounce_hostname,
  number_of_pages,
  number_of_pages / total_pages AS pct_pages
FROM bounce_sequences
INNER JOIN pages_total
  USING (client)
ORDER BY
  number_of_pages DESC,
  client,
  bounce_hostname  -- tie-breakers so LIMIT returns a deterministic set
LIMIT 100
53 changes: 53 additions & 0 deletions sql/2025/privacy/most_common_client_hints.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- Pages that use Client Hints, grouped by the advertised `accept-ch` value
-- (taken from the main document's response header or from a <meta http-equiv> tag).

WITH main_document_headers AS (
  -- One row per response header of each root page's main document,
  -- lower-cased, plus the per-client page count used as denominator.
  SELECT
    client,
    page,
    LOWER(response_header.name) AS header_name,
    LOWER(response_header.value) AS header_value,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_websites
  FROM `httparchive.crawl.requests`
  CROSS JOIN UNNEST(response_headers) AS response_header
  WHERE
    date = '2025-07-01' AND
    is_root_page AND
    is_main_document
),

root_page_metrics AS (
  -- The `almanac` custom metric of every root page.
  SELECT
    client,
    page,
    custom_metrics.other.almanac AS metrics
  FROM `httparchive.crawl.pages`
  WHERE
    date = '2025-07-01' AND
    is_root_page
),

http_equiv_tags AS (
  -- One row per meta node that has an `http-equiv` attribute, lower-cased.
  SELECT
    client,
    page,
    LOWER(SAFE.STRING(meta_node.`http-equiv`)) AS tag_name,
    LOWER(SAFE.STRING(meta_node.content)) AS tag_value
  FROM root_page_metrics
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(metrics.`meta-nodes`.nodes)) AS meta_node
  WHERE SAFE.STRING(meta_node.`http-equiv`) IS NOT NULL
)

SELECT
  client,
  -- Prefer the header's value on rows where the header is accept-ch;
  -- otherwise the row qualified through the meta tag.
  CASE
    WHEN headers.header_name = 'accept-ch' THEN headers.header_value
    ELSE tags.tag_value
  END AS value,
  COUNT(DISTINCT page) / ANY_VALUE(headers.total_websites) AS pct_pages,
  COUNT(DISTINCT page) AS number_of_pages
FROM main_document_headers AS headers
FULL OUTER JOIN http_equiv_tags AS tags
  USING (client, page)
WHERE 'accept-ch' IN (headers.header_name, tags.tag_name)
GROUP BY
  client,
  value
ORDER BY pct_pages DESC
LIMIT 200
26 changes: 26 additions & 0 deletions sql/2025/privacy/most_common_cmps_for_iab_tcf_v2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Number and share of pages per CMP id reported through the
-- IAB Transparency & Consent Framework v2 API.
-- cf. https://github.com/InteractiveAdvertisingBureau/GDPR-Transparency-and-Consent-Framework/blob/master/TCFv2/IAB%20Tech%20Lab%20-%20CMP%20API%20v2.md--tcdata
-- CMP vendor list: https://iabeurope.eu/cmp-list/

WITH page_cmps AS (
  -- One row per crawled page with its cmpId (NULL when the metric has no
  -- integer cmpId) and the per-client distinct page count as denominator.
  SELECT
    client,
    page,
    SAFE.INT64(custom_metrics.privacy.iab_tcf_v2.data.cmpId) AS cmpId,
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.crawl.pages`
  WHERE date = '2025-07-01'
)

-- Pages without a cmpId are not filtered out; they form the NULL group.
SELECT
  client,
  cmpId,
  COUNT(*) / ANY_VALUE(total_pages) AS pct_pages,
  COUNT(*) AS number_of_pages
FROM page_cmps
GROUP BY
  client,
  cmpId
ORDER BY pct_pages DESC
Loading
Loading