|
#!/usr/bin/env bash
# Composition attack: does averaging many queries break clipping?
#
# METHODOLOGY:
#   Run NQ independent queries (each with a different pac_seed) on the same
#   data. The attacker averages the NQ results. Noise in the average decreases
#   at 1/sqrt(NQ) but the outlier signal stays constant. With enough queries,
#   noise → 0 and the signal should be detectable.
#
#   For each NQ (number of queries), we compute the average over those queries
#   per trial, then find the best classification threshold across trials.
#   50% = random, 100% = perfect attack.
#
# CONFIGURATION (environment overrides; defaults are the original paths):
#   DUCKDB   - DuckDB CLI binary to run
#   PAC_EXT  - PAC extension file passed to LOAD
set -euo pipefail

DUCKDB="${DUCKDB:-/home/ila/Code/pac/build/release/duckdb}"
PAC_EXT="${PAC_EXT:-/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension}"

N=1000        # users generated per query run (user_id 1..N)
TV=999999     # acctbal of the extra user_id = 0 row inserted in the "in" condition
MI=0.0078125  # value passed to SET pac_mi
FILT=3        # the query selects user_id <= FILT (plus user_id = 0)
NT=15         # number of trials
| 19 | + |
#######################################
# Run one SUM(acctbal) query through DuckDB against a users table that is
# created and populated on every call, and print DuckDB's raw result.
# Globals:
#   DUCKDB, PAC_EXT, N, TV, MI, FILT (all read)
# Arguments:
#   $1 - condition: "in"  also inserts the row (user_id 0, acctbal TV);
#                   "out" leaves that row out
#   $2 - value for SET pac_seed
#   $3 - value for SET pac_clip_support, or "off" to skip that setting
# Outputs:
#   DuckDB's stdout (-noheader -list); DuckDB's stderr is discarded
# Returns:
#   2 if the condition is neither "in" nor "out", else DuckDB's exit status
#######################################
run_sum() {
  local cond=$1 seed=$2 clip=$3
  local insert="" clip_sql=""

  # Reject unknown conditions: silently treating a typo such as "IN" as "out"
  # would mislabel the resulting sample.
  case "$cond" in
    in) insert="INSERT INTO users VALUES (0, ${TV});" ;;
    out) ;;
    *)
      printf 'run_sum: unknown condition "%s" (expected in|out)\n' "$cond" >&2
      return 2
      ;;
  esac

  if [[ "$clip" != "off" ]]; then
    clip_sql="SET pac_clip_support = ${clip};"
  fi

  # NOTE(review): stderr is discarded exactly as before, so DuckDB errors only
  # show up as an empty result or a non-zero status -- confirm this is intended.
  "$DUCKDB" -noheader -list 2>/dev/null <<SQL
LOAD '${PAC_EXT}';
CREATE TABLE users(user_id INTEGER, acctbal INTEGER);
INSERT INTO users SELECT i, ((hash(i*31+7)%10000)+1)::INTEGER FROM generate_series(1,${N}) t(i);
${insert}
ALTER TABLE users ADD PAC_KEY(user_id);
ALTER TABLE users SET PU;
SET pac_mi = ${MI};
SET pac_seed = ${seed};
${clip_sql}
SELECT SUM(acctbal) FROM users WHERE user_id <= ${FILT} OR user_id = 0;
SQL
}
| 39 | + |
MAX_NQ=100  # max queries per trial

# Fail fast if the tools are missing: run_sum discards DuckDB's stderr, so a
# bad path would otherwise only surface as empty results after every query ran.
if ! command -v "$DUCKDB" >/dev/null 2>&1; then
  printf 'error: DuckDB binary not found or not executable: %s\n' "$DUCKDB" >&2
  exit 1
fi
if [[ ! -f "$PAC_EXT" ]]; then
  printf 'error: PAC extension not found: %s\n' "$PAC_EXT" >&2
  exit 1
fi

# All intermediate files live in one private directory that is removed on any
# exit path, including aborts triggered by set -e.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "${WORK_DIR:?}"' EXIT

for CLIP in off 2; do
  echo "==================================================="
  echo " COMPOSITION ATTACK: clip=${CLIP}, filt<=${FILT}"
  echo " ${NT} trials x ${MAX_NQ} queries each"
  echo "==================================================="
  echo ""

  # Collect all queries upfront. Each line is "<truth>,<trial>,<qid>,<value>";
  # the in/out runs of one (trial, q) pair share the same seed.
  IN_F="${WORK_DIR}/in_${CLIP}.csv"
  OUT_F="${WORK_DIR}/out_${CLIP}.csv"
  : > "$IN_F"
  : > "$OUT_F"
  failed=0
  for ((trial = 1; trial <= NT; trial++)); do
    for ((q = 1; q <= MAX_NQ; q++)); do
      s=$((trial * 10000 + q))
      # Whatever run_sum printed is still recorded; a failure is only counted.
      v_in=$(run_sum in "$s" "$CLIP") || failed=$((failed + 1))
      v_out=$(run_sum out "$s" "$CLIP") || failed=$((failed + 1))
      printf 'in,%s,%s,%s\n' "$trial" "$q" "$v_in" >> "$IN_F"
      printf 'out,%s,%s,%s\n' "$trial" "$q" "$v_out" >> "$OUT_F"
    done
    echo " trial ${trial}/${NT} done" >&2
  done
  if ((failed > 0)); then
    printf 'warning: %d of %d queries exited non-zero (clip=%s)\n' \
      "$failed" "$((2 * NT * MAX_NQ))" "$CLIP" >&2
  fi

  # Analyze at different NQ cutoffs
  for NQ in 1 5 10 25 50 100; do
    # A cutoff above MAX_NQ has no extra queries to average; skip it.
    ((NQ <= MAX_NQ)) || continue
    echo "--- NQ=${NQ} ---"
    "$DUCKDB" -markdown <<SQL
CREATE TABLE raw AS
SELECT split_part(c,',',1) AS truth,
       TRY_CAST(split_part(c,',',2) AS INTEGER) AS trial,
       TRY_CAST(split_part(c,',',3) AS INTEGER) AS qid,
       TRY_CAST(split_part(c,',',4) AS DOUBLE) AS v
FROM (
  SELECT column0 AS c FROM read_csv('${IN_F}',columns={'column0':'VARCHAR'},header=false)
  UNION ALL
  SELECT column0 FROM read_csv('${OUT_F}',columns={'column0':'VARCHAR'},header=false)
) WHERE split_part(c,',',4) != '';

-- Average first NQ queries per trial
WITH avgs AS (
  SELECT truth, trial, AVG(v) AS v
  FROM raw WHERE qid <= ${NQ} AND v IS NOT NULL
  GROUP BY truth, trial
)
SELECT truth, printf('%.0f', AVG(v)) AS mean, printf('%.0f', STDDEV(v)) AS std, COUNT(*) AS n
FROM avgs GROUP BY truth ORDER BY truth;

-- Best threshold classifier on averaged values
WITH avgs AS (
  SELECT truth, trial, AVG(v) AS v
  FROM raw WHERE qid <= ${NQ} AND v IS NOT NULL
  GROUP BY truth, trial
),
ths AS (SELECT UNNEST(generate_series(
  (SELECT (MIN(v))::BIGINT FROM avgs),
  (SELECT (MAX(v))::BIGINT FROM avgs),
  GREATEST(1, ((SELECT MAX(v)-MIN(v) FROM avgs)/50)::BIGINT)
)) AS t),
accs AS (
  SELECT t, 100.0*SUM(CASE
    WHEN truth='in' AND v > t THEN 1 WHEN truth='out' AND v <= t THEN 1
    ELSE 0 END)::DOUBLE / COUNT(*) AS acc
  FROM avgs, ths GROUP BY t
)
SELECT printf('%.1f%%', MAX(acc)) AS best_accuracy FROM accs;
SQL
    echo ""
  done
done
0 commit comments