bioinfo-chru-strasbourg
diff --git a/config/param.extann.json b/config/param.extann.json
Lines changed: 1 addition & 0 deletions
diff --git a/howard/functions/from_extann.py b/howard/functions/from_extann.py
Lines changed: 6 additions & 2 deletions
diff --git a/plugins/update_database/__main__.py b/plugins/update_database/__main__.py
Lines changed: 23 additions & 9 deletions
diff --git a/plugins/update_database/config/update_databases.json b/plugins/update_database/config/update_databases.json
Lines changed: 36 additions & 8 deletions
diff --git a/plugins/update_database/factory.py b/plugins/update_database/factory.py
Lines changed: 1 addition & 0 deletions
@@ -361,6 +361,7 @@
         }
     },
     "gnomad.v2.1.1.lof_metrics.by_gene.ordered.genes": {
+            "reference": "hg19",
             "genes": {
                 "Number": ".",
                 "Type": "String",
 
@@ -7,6 +7,7 @@
 import time
 import yaml  # type: ignore
 import subprocess
+import itertools
 
 from howard.functions.commons import command, full_path, transcripts_file_to_df
 
@@ -428,8 +429,11 @@ def get_aliases(gene: str, alias: pd.DataFrame) -> list:
     :param alias: HNGC dataframe from raw txt file
     """
     try:
-        alias_gene = find_rows_with_substring(alias, gene).values.tolist()[0]
-
+        alias_gene_tmp = find_rows_with_substring(alias, gene).values.tolist()
+        if all(isinstance(sublist, list) for sublist in alias_gene_tmp):
+            alias_gene = itertools.chain(*alias_gene_tmp)
+        else:
+            alias_gene = alias_gene_tmp
     except AttributeError:
         return []
     alias_gene_splitted = []
 
@@ -4,6 +4,7 @@
 import os
 from howard.functions.commons import DEFAULT_DATABASE_FOLDER
 import multiprocess as mp
+import concurrent.futures
 
 sys.path.append(os.path.join(os.path.dirname(__file__)))
 from plugins.update_database import clinvar, gnomad, cadd, omim
@@ -26,9 +27,7 @@
     },
     "update_config": {
         "help": """Path of json configuration file.\n""",
-        "default": os.path.join(
-            os.path.dirname(__file__), "config", "update_databases.json"
-        ),
+        "default": os.path.join(os.path.dirname(__file__), "config", "update_databases.json"),
         "type": str,
     },
     "current_folder": {
@@ -71,7 +70,7 @@
 
 # Main function
 def main(args: argparse) -> None:
-    """
+    """
     Query input VCF file and show result
     """
 
@@ -88,13 +87,30 @@ def main(args: argparse) -> None:
 
     elif args.database == "gnomad":
         log.info("Update Gnomad")
-        gnomad.Gnomad(
+        gnom = gnomad.Gnomad(
             database=args.database,
             databases_folder=args.databases_folder,
             config_json=args.update_config,
             current_folder=args.current_folder,
             data_folder=args.data_folder,
-        ).update_gnomad()
+        )
+
+        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
+            futures = {
+                executor.submit(gnom, file): os.path.join(gnom.data_folder, file)
+                for file in os.listdir(gnom.data_folder)
+                if file.endswith(".bgz")
+                and not os.path.exists(
+                    os.path.join(gnom.data_folder, file).replace(".vcf.bgz", ".parsed.vcf.gz")
+                )
+            }
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    result = future.result()
+                    log.info(result)
+                except Exception as e:
+                    log.error(f"Error processing {futures[future]}: {e}")
+        gnom.update_gnomad()
 
     elif args.database == "CADD":
         cadd_input = [
@@ -107,9 +123,7 @@ def main(args: argparse) -> None:
             input_args = cadd.update_cadd(
                 cadd_input,
                 os.path.join(args.data_folder, "processing"),
-                os.path.join(
-                    args.data_folder, f"CADD.generated.{utils.now()}.partition.parquet"
-                ),
+                os.path.join(args.data_folder, f"CADD.generated.{utils.now()}.partition.parquet"),
             )
         # Start processing
         with mp.Pool(10) as p:
 
@@ -166,39 +166,67 @@
                 "Description": "Gene symbol"
             },
             "transcript": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "String",
                 "Description": "Refgene transcript"
             },
             "OMIM_phenotype": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "String",
                 "Description": "List of OMIM phenotype"
             },
             "OMIM_inheritance": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "String",
                 "Description": "List of OMIM inheritance"
             },
             "OMIM_ID": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "Integer",
                 "Description": "List of OMIM ID"
             },
             "OMIM_morbid": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "String",
                 "Description": "Morbid Yes"
             },
             "OMIM_morbid_candidate": {
-                "Number": "1",
+                "Number": ".",
                 "Type": "String",
                 "Description": "Morbid candidate Yes"
             }
         }
     },
     "omim_ambiguous": {
-        "full": ["ARMD10", "CELIAC4", "CORD7","CRAC1","DDH1","DFNA48","EOS","ETL4","HPC3","IDDM21","LGS","MRT4","MRX45","MRX46","MRX88","MRX89","MRX91","MRX92","MS2","NAIC","OS4","PAC1","PCAP","PPR1","PPR3","PSORS12","SHFM3","SLC22A1L","TST1","TST2"],
+        "full": ["ARMD10", "CELIAC4", "CORD7","CRAC1","DDH1","DFNA48","EOS","ETL4","HPC3","IDDM21","LGS","MRT4","MRX45","MRX46","MRX88","MRX89","MRX91","MRX92","MS2","NAIC","OS4","PAC1","PCAP","PPR1","PPR3","PSORS12","SHFM3","SLC22A1L","TST1","TST2","ZWS","WS","MP1","DCR","EMP","ACP","ORP1", "RP15","CCA1","CLF","MST","RCD1","RMD1","RP17"],
         "keep": ["ARMD10", "CELIAC4", "CORD7", "CRAC1", "DDH1", "DFNA48", "SLC22A1L"]
-    }
+    },
+    "gnomad_fields": [
+        "nhomalt_joint",
+        "AC_joint",
+        "AN_joint",
+        "AC_joint_XY",
+        "AC_joint_afr",
+        "AC_joint_ami",
+        "AC_joint_amr",
+        "AC_joint_asj",
+        "AC_joint_eas",
+        "AC_joint_fin",
+        "AC_joint_mid",
+        "AC_joint_nfe",
+        "AC_joint_sas",
+        "AC_joint_remaining",
+        "AN_joint_afr",
+        "AN_joint_ami",
+        "AN_joint_amr",
+        "AN_joint_asj",
+        "AN_joint_eas",
+        "AN_joint_fin",
+        "AN_joint_mid",
+        "AN_joint_nfe",
+        "AN_joint_sas",
+        "AN_joint_remaining",
+        "AC_grpmax_joint",
+        "AN_grpmax_joint"
+    ]
 }
@@ -356,6 +356,7 @@ def vcf_to_parquet(self, file):
         threads = round(get_threads() / 2)
         memory = str(round(int(re.match(r'\d+', get_memory())[0]) / 2))+"G"
         param = {"input": file, "output": output, "explode": {"explode_infos": True}, "threads": threads, "memory": memory}
+        log.debug(f"VCF to parquet param: {param}")
         convert(
             argparse.Namespace(
                 command="convert",
Original file line number	Diff line number	Diff line change
`@@ -361,6 +361,7 @@`
`361`	`361`	`}`
`362`	`362`	`},`
`363`	`363`	`"gnomad.v2.1.1.lof_metrics.by_gene.ordered.genes": {`
	`364`	`+ "reference": "hg19",`
`364`	`365`	`"genes": {`
`365`	`366`	`"Number": ".",`
`366`	`367`	`"Type": "String",`