-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_dataset.sh
More file actions
executable file
·85 lines (63 loc) · 2.71 KB
/
prepare_dataset.sh
File metadata and controls
executable file
·85 lines (63 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# Set up the two working directories this script relies on, relative to the
# directory it is invoked from:
#   data/      - where the datasets are stored
#   $LOGS_DIR  - where each step's output is copied via tee
LOGS_DIR="LOGS"

# Create the data directory only when it is not there yet.
[ -d "data" ] || {
  echo "Creating data directory..."
  mkdir data
}

# Same for the log directory.
[ -d "$LOGS_DIR" ] || {
  echo "Creating LOGS directory..."
  mkdir "$LOGS_DIR"
}
# Locations of the downloaded artifacts (relative to the invocation directory).
FSE2022_FILE="data/fse2022_data.tar.bz2"
SBES2022_FILE="data/sbes2022_data.json"
SBES2022_METADATA="data/dataset1_metadata.csv"
FSE2022_DIR="data/fse2022"

#######################################
# Download a file unless it is already present.
# The payload is written to "<dest>.part" first and renamed to <dest> only
# after curl succeeded, so an interrupted transfer or an HTTP error page
# (curl -f) is never mistaken for a finished download on the next run.
# Arguments: $1 - label used in the progress messages
#            $2 - URL to fetch
#            $3 - destination path
# Outputs:   progress messages on stdout, failure message on stderr
# Returns:   0 if the file exists or was downloaded, 1 on download failure
#######################################
download_if_missing() {
  local label=$1 url=$2 dest=$3

  if [[ -f "$dest" ]]; then
    echo "$label already exists. Skipping download."
    return 0
  fi

  echo "Downloading $label..."
  if curl -fL "$url" -o "$dest.part"; then
    mv -- "$dest.part" "$dest"
  else
    rm -f -- "$dest.part"
    echo "Failed to download $label from $url" >&2
    return 1
  fi
}

# The group is the left side of a pipeline, so it runs in a subshell:
# "exit 1" below ends only the group; its status is picked up afterwards
# through PIPESTATUS.
{
  # Download the FSE2022 data if it doesn't already exist
  download_if_missing "FSE2022 data" \
    "https://zenodo.org/records/6366908/files/fse-2022-MergeBERT-data.tar.bz2?download=1" \
    "$FSE2022_FILE" || exit 1

  # Extract the FSE2022 data if the directory doesn't already exist
  # NOTE(review): this check assumes the archive unpacks into
  # data/fse2022 - confirm against the archive layout.
  if [[ ! -d "$FSE2022_DIR" ]]; then
    echo "Extracting FSE2022 data..."
    tar -xjf "$FSE2022_FILE" -C data || {
      echo "Failed to extract $FSE2022_FILE" >&2
      exit 1
    }
  else
    echo "FSE2022 data already extracted. Skipping extraction."
  fi

  # Download the SBES2022 data if it doesn't already exist
  download_if_missing "SBES2022 data" \
    "https://api.figshare.com/v2/file/download/49875561" \
    "$SBES2022_FILE" || exit 1

  # Download the SBES2022 metadata if it doesn't already exist
  download_if_missing "SBES2022 metadata" \
    "https://api.figshare.com/v2/file/download/50335902" \
    "$SBES2022_METADATA" || exit 1
} | tee -a "$LOGS_DIR/download_datasets.txt"
download_status=${PIPESTATUS[0]}

# Without the datasets the later steps cannot do anything useful.
if [[ "$download_status" -ne 0 ]]; then
  echo "Dataset download/extraction failed; aborting." >&2
  exit 1
fi
#######################################
# Run one Python stage and copy its combined stdout/stderr to a log file.
# The pipeline's own status is tee's, so the Python exit status is read
# from PIPESTATUS; a failing stage stops the script instead of letting the
# following stages run anyway.
# Globals:   LOGS_DIR (read)
# Arguments: $1 - script name without the ".py" suffix; also used as the
#                 log file name (<name>.txt)
# Outputs:   the stage's output on stdout, failure message on stderr
# Returns:   0 on success; exits the script with the stage's status on failure
#######################################
run_stage() {
  local name=$1 status

  python3 "$name.py" 2>&1 | tee "$LOGS_DIR/$name.txt"
  status=${PIPESTATUS[0]}

  if [[ "$status" -ne 0 ]]; then
    echo "$name.py failed with exit status $status; aborting." >&2
    exit "$status"
  fi
}

# NOTE(review): the stages are run in this fixed order; presumably each one
# consumes what the previous ones produced - confirm before reordering.
echo "Running FSE2022 dataset transformation..."
run_stage transform_fse2022_dataset_for_ase2023
echo "Running SBES2022 dataset transformation..."
run_stage transform_SBES2022_dataset_to_ase2023
run_stage filter_FSE_dataset
run_stage separate_datasets
run_stage extract_combination_on_separated_datasets
# Process every dataset variant with mergeGen/run_total_dataset.py.
#
# The directory change happens inside a subshell so that:
#   * the script's own working directory is never altered (no "cd .." that
#     could climb out of the start directory after a failed "cd"), and
#   * tee, which runs outside the subshell, opens the log relative to the
#     start directory - i.e. in the $LOGS_DIR created at the top of the
#     script rather than in a LOGS directory under mergeGen.
(
  cd mergeGen || {
    echo "Cannot enter the mergeGen directory." >&2
    exit 1
  }

  # Every dataset is attempted even if an earlier one failed; the subshell
  # reports failure if any of them did.
  failed=0
  for dataset in dataset1 dataset2 dataset2_Java dataset2_CSharp \
    dataset2_JavaScript dataset2_TypeScript; do
    python3 run_total_dataset.py "$dataset" dataset_parallel.py || failed=1
  done
  exit "$failed"
) | tee -a "$LOGS_DIR/proccess_mergegen_data.txt"
mergegen_status=${PIPESTATUS[0]}

if [[ "$mergegen_status" -ne 0 ]]; then
  echo "mergeGen dataset processing reported failures." >&2
  exit 1
fi

echo "All tasks completed."