Skip to content

Commit ed13143

Browse files
Merge pull request #438 from bioinfo-chru-strasbourg/plugins_databases
Plugins databases
2 parents 266dd26 + 47ec03c commit ed13143

File tree

9 files changed

+196
-113
lines changed

9 files changed

+196
-113
lines changed

config/param.extann.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@
361361
}
362362
},
363363
"gnomad.v2.1.1.lof_metrics.by_gene.ordered.genes": {
364+
"reference": "hg19",
364365
"genes": {
365366
"Number": ".",
366367
"Type": "String",

howard/functions/from_extann.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import time
88
import yaml # type: ignore
99
import subprocess
10+
import itertools
1011

1112
from howard.functions.commons import command, full_path, transcripts_file_to_df
1213

@@ -428,8 +429,11 @@ def get_aliases(gene: str, alias: pd.DataFrame) -> list:
428429
:param alias: HGNC dataframe from raw txt file
429430
"""
430431
try:
431-
alias_gene = find_rows_with_substring(alias, gene).values.tolist()[0]
432-
432+
alias_gene_tmp = find_rows_with_substring(alias, gene).values.tolist()
433+
if all(isinstance(sublist, list) for sublist in alias_gene_tmp):
434+
alias_gene = itertools.chain(*alias_gene_tmp)
435+
else:
436+
alias_gene = alias_gene_tmp
433437
except AttributeError:
434438
return []
435439
alias_gene_splitted = []

plugins/update_database/__main__.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
from howard.functions.commons import DEFAULT_DATABASE_FOLDER
66
import multiprocess as mp
7+
import concurrent.futures
78

89
sys.path.append(os.path.join(os.path.dirname(__file__)))
910
from plugins.update_database import clinvar, gnomad, cadd, omim
@@ -26,9 +27,7 @@
2627
},
2728
"update_config": {
2829
"help": """Path of json configuration file.\n""",
29-
"default": os.path.join(
30-
os.path.dirname(__file__), "config", "update_databases.json"
31-
),
30+
"default": os.path.join(os.path.dirname(__file__), "config", "update_databases.json"),
3231
"type": str,
3332
},
3433
"current_folder": {
@@ -71,7 +70,7 @@
7170

7271
# Main function
7372
def main(args: argparse) -> None:
74-
"""
73+
"""
7574
Update the database selected by args.database
7675
"""
7776

@@ -88,13 +87,30 @@ def main(args: argparse) -> None:
8887

8988
elif args.database == "gnomad":
9089
log.info("Update Gnomad")
91-
gnomad.Gnomad(
90+
gnom = gnomad.Gnomad(
9291
database=args.database,
9392
databases_folder=args.databases_folder,
9493
config_json=args.update_config,
9594
current_folder=args.current_folder,
9695
data_folder=args.data_folder,
97-
).update_gnomad()
96+
)
97+
98+
with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
99+
futures = {
100+
executor.submit(gnom, file): os.path.join(gnom.data_folder, file)
101+
for file in os.listdir(gnom.data_folder)
102+
if file.endswith(".bgz")
103+
and not os.path.exists(
104+
os.path.join(gnom.data_folder, file).replace(".vcf.bgz", ".parsed.vcf.gz")
105+
)
106+
}
107+
for future in concurrent.futures.as_completed(futures):
108+
try:
109+
result = future.result()
110+
log.info(result)
111+
except Exception as e:
112+
log.error(f"Error processing {futures[future]}: {e}")
113+
gnom.update_gnomad()
98114

99115
elif args.database == "CADD":
100116
cadd_input = [
@@ -107,9 +123,7 @@ def main(args: argparse) -> None:
107123
input_args = cadd.update_cadd(
108124
cadd_input,
109125
os.path.join(args.data_folder, "processing"),
110-
os.path.join(
111-
args.data_folder, f"CADD.generated.{utils.now()}.partition.parquet"
112-
),
126+
os.path.join(args.data_folder, f"CADD.generated.{utils.now()}.partition.parquet"),
113127
)
114128
# Start processing
115129
with mp.Pool(10) as p:

plugins/update_database/config/update_databases.json

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,39 +166,67 @@
166166
"Description": "Gene symbol"
167167
},
168168
"transcript": {
169-
"Number": "1",
169+
"Number": ".",
170170
"Type": "String",
171171
"Description": "Refgene transcript"
172172
},
173173
"OMIM_phenotype": {
174-
"Number": "1",
174+
"Number": ".",
175175
"Type": "String",
176176
"Description": "List of OMIM phenotype"
177177
},
178178
"OMIM_inheritance": {
179-
"Number": "1",
179+
"Number": ".",
180180
"Type": "String",
181181
"Description": "List of OMIM inheritance"
182182
},
183183
"OMIM_ID": {
184-
"Number": "1",
184+
"Number": ".",
185185
"Type": "Integer",
186186
"Description": "List of OMIM ID"
187187
},
188188
"OMIM_morbid": {
189-
"Number": "1",
189+
"Number": ".",
190190
"Type": "String",
191191
"Description": "Morbid Yes"
192192
},
193193
"OMIM_morbid_candidate": {
194-
"Number": "1",
194+
"Number": ".",
195195
"Type": "String",
196196
"Description": "Morbid candidate Yes"
197197
}
198198
}
199199
},
200200
"omim_ambiguous": {
201-
"full": ["ARMD10", "CELIAC4", "CORD7","CRAC1","DDH1","DFNA48","EOS","ETL4","HPC3","IDDM21","LGS","MRT4","MRX45","MRX46","MRX88","MRX89","MRX91","MRX92","MS2","NAIC","OS4","PAC1","PCAP","PPR1","PPR3","PSORS12","SHFM3","SLC22A1L","TST1","TST2"],
201+
"full": ["ARMD10", "CELIAC4", "CORD7","CRAC1","DDH1","DFNA48","EOS","ETL4","HPC3","IDDM21","LGS","MRT4","MRX45","MRX46","MRX88","MRX89","MRX91","MRX92","MS2","NAIC","OS4","PAC1","PCAP","PPR1","PPR3","PSORS12","SHFM3","SLC22A1L","TST1","TST2","ZWS","WS","MP1","DCR","EMP","ACP","ORP1", "RP15","CCA1","CLF","MST","RCD1","RMD1","RP17"],
202202
"keep": ["ARMD10", "CELIAC4", "CORD7", "CRAC1", "DDH1", "DFNA48", "SLC22A1L"]
203-
}
203+
},
204+
"gnomad_fields": [
205+
"nhomalt_joint",
206+
"AC_joint",
207+
"AN_joint",
208+
"AC_joint_XY",
209+
"AC_joint_afr",
210+
"AC_joint_ami",
211+
"AC_joint_amr",
212+
"AC_joint_asj",
213+
"AC_joint_eas",
214+
"AC_joint_fin",
215+
"AC_joint_mid",
216+
"AC_joint_nfe",
217+
"AC_joint_sas",
218+
"AC_joint_remaining",
219+
"AN_joint_afr",
220+
"AN_joint_ami",
221+
"AN_joint_amr",
222+
"AN_joint_asj",
223+
"AN_joint_eas",
224+
"AN_joint_fin",
225+
"AN_joint_mid",
226+
"AN_joint_nfe",
227+
"AN_joint_sas",
228+
"AN_joint_remaining",
229+
"AC_grpmax_joint",
230+
"AN_grpmax_joint"
231+
]
204232
}

plugins/update_database/factory.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ def vcf_to_parquet(self, file):
356356
threads = round(get_threads() / 2)
357357
memory = str(round(int(re.match(r'\d+', get_memory())[0]) / 2))+"G"
358358
param = {"input": file, "output": output, "explode": {"explode_infos": True}, "threads": threads, "memory": memory}
359+
log.debug(f"VCF to parquet param: {param}")
359360
convert(
360361
argparse.Namespace(
361362
command="convert",

0 commit comments

Comments
 (0)