-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess.py
More file actions
145 lines (122 loc) · 5.77 KB
/
process.py
File metadata and controls
145 lines (122 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
import logging
import multiprocessing as mp
import os
from datetime import datetime
from pathlib import Path
import glob
import librosa
from zsvision.zs_multiproc import starmap_with_kwargs
import soundfile as sf
import pandas as pd
import wget
def resample_audio(load_folder: Path, save_folder: Path, audio_path: Path, resample_rate):
    """
    Resample one audio file and write the result below save_folder.

    The output keeps the path that audio_path has relative to load_folder, so
    the directory layout below load_folder is reproduced below save_folder.
    Any error raised while creating the output directory, loading the audio or
    writing it is logged and swallowed, so one bad file does not stop a batch.

    Inputs:
        load_folder: path of top directory of audio files to be resampled
        save_folder: path of top directory where the resampled file is written
        audio_path: location of the audio file to resample (below load_folder)
        resample_rate: resample rate for the audio file
    """
    relative_path = os.path.relpath(audio_path, load_folder)
    try:
        resampled_audio_path = os.path.join(save_folder, relative_path)
        target_dir = os.path.dirname(resampled_audio_path)
        # An empty dirname means "current directory"; os.makedirs('') would raise.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        audio, _ = librosa.load(audio_path, sr=resample_rate)
        sf.write(resampled_audio_path, audio, resample_rate, 'PCM_24')
    except Exception as e:
        # Reported as a warning (not info) so the failure is still visible when
        # the logging configuration is left at its default WARNING threshold.
        logging.warning(f'File {relative_path} could not be resampled because of error {e}')
def resample_audios(load_folder: Path, save_folder: Path, logging, resample_rate, processes):
    """
    Resampling all .wav files residing within the load_folder path.

    Files are searched recursively. A file is skipped when the directory part
    of its path below load_folder contains one of the known sample-rate
    strings. Only the part below load_folder is inspected, so a load_folder
    whose own path happens to contain such a string does not exclude
    every file.

    Inputs:
        load_folder: Location where audio file to be resampled reside. The folder structure here will be mimiced at save_folder
        save_folder: Location where to save the resampled audio files
        logging: Logging module containing information about the progress of the code
        resample_rate: resample rate handed to resample_audio for every file
        processes: Number of processes resampling audio content at
                    the same time; values of 1 or less run sequentially
    """
    rate_markers = ('44800', '44100', '16000', '22050', '32000')
    audio_paths = []
    for candidate in glob.glob(os.path.join(load_folder, '**', '*.wav'), recursive=True):
        relative_dir = os.path.relpath(os.path.dirname(candidate), load_folder)
        if not any(marker in relative_dir for marker in rate_markers):
            audio_paths.append(candidate)

    kwarg_list = [
        {
            "load_folder": load_folder,
            "save_folder": save_folder,
            "audio_path": audio_path,
            "resample_rate": resample_rate,
        }
        for audio_path in audio_paths
    ]

    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=resample_audio, kwargs_iter=kwarg_list)
    else:
        for kwarg in kwarg_list:
            resample_audio(**kwarg)
def download_audio(save_folder: Path, file_name: Path, link: str):
    """
    Download one audio file.

    Any error raised while building the destination path or downloading is
    logged and swallowed, so one bad entry does not stop a batch.

    Inputs:
        save_folder: folder to save audio files
        file_name: name (relative to save_folder) to save the audio file as
        link: link to download the audio file
    """
    try:
        # Built inside the try so an unusable file_name is reported like any
        # other per-file failure instead of propagating to the caller.
        audio_save_path = os.path.join(save_folder, file_name)
        wget.download(link, audio_save_path)
    except Exception as e:
        # Reported as a warning (not info) so the failure is still visible when
        # the logging configuration is left at its default WARNING threshold.
        logging.warning(f'File {link} could not be downloaded because of error {e}')
def download_audios(csv_path: Path, save_folder: Path, logging, processes):
    """
    Download all audio files listed in a CSV file.

    The CSV is read with pandas and must provide the columns 'fname' (name to
    save each file as) and 'download_link' (where to fetch it from). Rows
    where either value is missing are skipped and reported with a warning.

    Inputs:
        csv_path: Location of the CSV file listing the files to download
        save_folder: Location where to save the downloaded audio files
        logging: Logging module containing information about the progress of the code
        processes: Number of processes downloading audio content at
                    the same time; values of 1 or less run sequentially
    """
    listing = pd.read_csv(csv_path)
    usable = listing.dropna(subset=['fname', 'download_link'])
    skipped = len(listing) - len(usable)
    if skipped:
        logging.warning(f'Skipping {skipped} row(s) of {csv_path} with a missing fname or download_link')

    kwarg_list = [
        {
            "save_folder": save_folder,
            "file_name": file_name,
            "link": link,
        }
        for file_name, link in zip(usable['fname'], usable['download_link'])
    ]

    if processes > 1:
        with mp.Pool(processes=processes) as pool:
            starmap_with_kwargs(pool=pool, func=download_audio, kwargs_iter=kwarg_list)
    else:
        for kwarg in kwarg_list:
            download_audio(**kwarg)
def main():
    """
    Command-line entry point: download the audio files listed in a CSV file
    and resample them.

    Below --save_folder_path this creates 'download' (raw downloads) and
    '<resample_rate>/audios' (resampled files) plus '<resample_rate>/logs',
    which receives a timestamped log file. Log records are also echoed
    through a stream handler.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv_path", type=Path, required=True)
    parser.add_argument("--save_folder_path", type=Path, required=True)
    # NOTE(review): 44800 may be a typo for 48000 — confirm before changing,
    # since resample_audios filters on the same set of strings.
    parser.add_argument("--resample_rate", type=int, default=44100, choices=[44100, 16000, 22050, 32000, 44800])
    parser.add_argument("--processes", type=int, default=1)
    args = parser.parse_args()

    # Create necessary folders
    resample_folder_path = os.path.join(args.save_folder_path, str(args.resample_rate))
    log_folder_path = os.path.join(resample_folder_path, 'logs')
    resample_save_path = os.path.join(resample_folder_path, 'audios')
    # makedirs also creates the intermediate resample folder itself.
    os.makedirs(log_folder_path, exist_ok=True)
    os.makedirs(resample_save_path, exist_ok=True)

    log_file_name = f"{datetime.now().strftime(r'%m%d_%H%M%S')}.log"
    logging.basicConfig(filename=os.path.join(log_folder_path, log_file_name), level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())

    logging.info(f'Creating folder {args.save_folder_path}/download')
    download_folder_path = os.path.join(args.save_folder_path, 'download')
    os.makedirs(download_folder_path, exist_ok=True)

    # Download files
    logging.info('Starting to download files')
    download_audios(args.csv_path, download_folder_path, logging, args.processes)

    # Resample files
    logging.info('Starting to resample files')
    resample_audios(download_folder_path, resample_save_path, logging, args.resample_rate, args.processes)


if __name__ == "__main__":
    main()