Athena/src/Bao/generate_dataset.py at master · AlayaDB-AI/Athena · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import os
import argparse
import json

import psycopg2
from tqdm import tqdm

sys.path.append('.')
from src.utils.db_utils import DBConn
from src.utils.workload_utils import read_workload

ALL_OPTIONS = [
    "enable_nestloop", "enable_hashjoin", "enable_mergejoin",
    "enable_seqscan", "enable_indexscan", "enable_indexonlyscan"
]

all_48_hint_sets = '''hashjoin,indexonlyscan
hashjoin,indexonlyscan,indexscan
hashjoin,indexonlyscan,indexscan,mergejoin
hashjoin,indexonlyscan,indexscan,mergejoin,nestloop
hashjoin,indexonlyscan,indexscan,mergejoin,seqscan
hashjoin,indexonlyscan,indexscan,nestloop
hashjoin,indexonlyscan,indexscan,nestloop,seqscan
hashjoin,indexonlyscan,indexscan,seqscan
hashjoin,indexonlyscan,mergejoin
hashjoin,indexonlyscan,mergejoin,nestloop
hashjoin,indexonlyscan,mergejoin,nestloop,seqscan
hashjoin,indexonlyscan,mergejoin,seqscan
hashjoin,indexonlyscan,nestloop
hashjoin,indexonlyscan,nestloop,seqscan
hashjoin,indexonlyscan,seqscan
hashjoin,indexscan
hashjoin,indexscan,mergejoin
hashjoin,indexscan,mergejoin,nestloop
hashjoin,indexscan,mergejoin,nestloop,seqscan
hashjoin,indexscan,mergejoin,seqscan
hashjoin,indexscan,nestloop
hashjoin,indexscan,nestloop,seqscan
hashjoin,indexscan,seqscan
hashjoin,mergejoin,nestloop,seqscan
hashjoin,mergejoin,seqscan
hashjoin,nestloop,seqscan
hashjoin,seqscan
indexonlyscan,indexscan,mergejoin
indexonlyscan,indexscan,mergejoin,nestloop
indexonlyscan,indexscan,mergejoin,nestloop,seqscan
indexonlyscan,indexscan,mergejoin,seqscan
indexonlyscan,indexscan,nestloop
indexonlyscan,indexscan,nestloop,seqscan
indexonlyscan,mergejoin
indexonlyscan,mergejoin,nestloop
indexonlyscan,mergejoin,nestloop,seqscan
indexonlyscan,mergejoin,seqscan
indexonlyscan,nestloop
indexonlyscan,nestloop,seqscan
indexscan,mergejoin
indexscan,mergejoin,nestloop
indexscan,mergejoin,nestloop,seqscan
indexscan,mergejoin,seqscan
indexscan,nestloop
indexscan,nestloop,seqscan
mergejoin,nestloop,seqscan
mergejoin,seqscan
nestloop,seqscan'''

all_48_hint_sets = all_48_hint_sets.split('\n')
all_48_hint_sets = [ ["enable_"+j for j in i.split(',')] for i in all_48_hint_sets]

def arm_idx_to_hints(arm_idx: int) -> list[str]:
    hints = []
    for option in ALL_OPTIONS:
        hints.append(f"SET {option} TO off")

    if arm_idx >= 1 and arm_idx < 49:
        for i in all_48_hint_sets[arm_idx - 1]:
            hints.append(f"SET {i} TO on")
    elif arm_idx == 0:
        for option in ALL_OPTIONS:
            hints.append(f"SET {option} TO on") # default PG setting
    else:
        print('48 hint set error')
        exit(0)
    return hints

def main(args: argparse.Namespace):
    names, queries = read_workload(args.workload)
    dataset_path = f'datasets/{args.database}/{args.workload}/Bao'
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    names_path = os.path.join(dataset_path, 'names.json')
    with open(names_path, 'w') as f:
        json.dump(names, f)
    with DBConn(args.database) as db:
        db.prewarm()
        for query_idx, (name, query) in enumerate(zip(names, queries)):
            if query_idx < args.query_begin:
                continue
            plan_set = set()
            arms = []
            for arm in range(49):
                hints = arm_idx_to_hints(arm)
                plan = db.get_plan(query, hints)
                plan_str = json.dumps(plan)
                if plan_str not in plan_set:
                    plan_set.add(plan_str)
                    arms.append(arm)
            timeout = 0
            samples = []
            print(f"Generate plans of {name}")
            for arm in tqdm(arms):
                hints = arm_idx_to_hints(arm)
                if args.generate_candidate_plans:
                    sample = db.get_plan(query, hints)
                else:
                    try:
                        try:
                            db.get_result(query, hints, timeout=timeout)
                        except psycopg2.errors.QueryCanceled:
                            db.rollback()
                        sample = db.get_result(query, hints, timeout=timeout)
                        if timeout == 0:
                            timeout = 4 * sample['Execution Time']
                            if timeout >= 240000:
                                timeout = max(sample['Execution Time'], 240000)
                            elif timeout <= 5000:
                                timeout = 5000
                    except psycopg2.errors.QueryCanceled:
                        sample = db.get_plan(query, hints)
                        db.rollback()
                samples.append(sample)
            query_path = os.path.join(dataset_path, f'query_{query_idx:04d}.json')
            with open(query_path, 'w') as f:
                json.dump(samples, f)
            option_path = os.path.join(dataset_path, f'option_{query_idx:04d}.json')
            with open(option_path, 'w') as f:
                json.dump([arm if arm != None else None for arm in arms], f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--database', type=str, default='imdb')
    parser.add_argument('--workload', type=str, default='JOB')
    parser.add_argument('--query_begin', type=int, default=0)
    parser.add_argument('--generate_candidate_plans', action='store_true', help='generate candidate plans instead of execution results')
    args = parser.parse_args()
    main(args)