Skip to content

Commit a613f5a

Browse files
authored
Merge branch 'staging' into biopython
2 parents 568a6ac + 834039a commit a613f5a

File tree

2 files changed

+235
-2
lines changed

2 files changed

+235
-2
lines changed

helpers.py

Lines changed: 234 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
import settings
99
from jsonschema import validate, ValidationError, Draft7Validator
1010
from flask import render_template_string
11+
import pandas as pd
12+
from io import StringIO
13+
import re
1114
from auth import KeycloakAuth
1215
from sendgrid import SendGridAPIClient
1316
from sendgrid.helpers.mail import (
@@ -472,10 +475,239 @@ def get_minio_client(self):
472475

473476
except Exception as e:
474477
raise e
475-
478+
479+
476480
def tsv_to_json(tsv_string, project_id):
477-
import re
481+
"""
482+
Convert TSV string to JSON list using pandas for robust parsing.
483+
Falls back to legacy implementation on error.
484+
"""
485+
try:
486+
return tsv_to_json_pandas(tsv_string, project_id)
487+
except Exception as e:
488+
logger.warning(f"Pandas TSV parsing failed, falling back to legacy: {e}")
489+
return tsv_to_json_legacy(tsv_string, project_id)
490+
491+
492+
def tsv_to_json_pandas(tsv_string, project_id):
493+
"""
494+
Refactored TSV to JSON conversion using pandas for parsing and jsonschema for validation.
495+
496+
IMPROVEMENTS:
497+
1. Pandas handles TSV parsing robustly (edge cases, escaping, quotes)
498+
2. Vectorized operations are much faster for large datasets
499+
3. Automatic type inference with better null handling
500+
4. Less manual string manipulation
501+
5. Built-in handling of missing values
502+
6. Cleaner, more maintainable code
503+
"""
504+
505+
with get_db_cursor() as cursor:
506+
# Get schema
507+
cursor.execute(
508+
"""
509+
SELECT pathogen_id
510+
FROM projects
511+
WHERE id = %s AND deleted_at IS NULL
512+
""",
513+
(project_id,),
514+
)
515+
project_record = cursor.fetchone()
516+
if not project_record:
517+
raise ValueError(f"Project ID {project_id} not found")
518+
519+
pathogen_id = project_record["pathogen_id"]
520+
521+
cursor.execute(
522+
"""
523+
SELECT schema_id
524+
FROM pathogens
525+
WHERE id = %s AND deleted_at IS NULL
526+
""",
527+
(pathogen_id,),
528+
)
529+
pathogen_record = cursor.fetchone()
530+
if not pathogen_record:
531+
raise ValueError(f"Pathogen ID {pathogen_id} not found")
532+
533+
schema_id = pathogen_record["schema_id"]
534+
535+
cursor.execute(
536+
"""
537+
SELECT schema
538+
FROM schemas
539+
WHERE id = %s AND deleted_at IS NULL
540+
""",
541+
(schema_id,),
542+
)
543+
schema_record = cursor.fetchone()
544+
if not schema_record:
545+
raise ValueError(f"Schema ID {schema_id} not found")
546+
547+
schema = schema_record["schema"]
548+
549+
# IMPROVEMENT 1: Use pandas to parse TSV
550+
# Handles edge cases like quoted fields, embedded tabs, etc.
551+
try:
552+
df = pd.read_csv(
553+
StringIO(tsv_string),
554+
sep='\t',
555+
dtype=str, # Read everything as string initially
556+
keep_default_na=False, # Don't convert empty strings to NaN
557+
na_values=[''], # But still recognize empty strings as NA
558+
skipinitialspace=True # Strip leading whitespace
559+
)
560+
except Exception as e:
561+
raise ValueError(f"Failed to parse TSV: {str(e)}")
562+
563+
# IMPROVEMENT 2: Strip whitespace from column names
564+
df.columns = df.columns.str.strip()
565+
566+
# IMPROVEMENT 3: Process columns based on schema using vectorized operations
567+
properties = schema.get("properties", {})
568+
569+
for column in df.columns:
570+
if column not in properties:
571+
continue # Skip columns not in schema
572+
573+
field_schema = properties[column]
574+
575+
# Handle oneOf fields
576+
if "oneOf" in field_schema:
577+
# Apply the oneOf processing to each value in the column
578+
df[column] = df[column].apply(lambda x: process_oneof_field_pandas(x, field_schema))
579+
else:
580+
# Process based on field type
581+
field_type = field_schema.get("type")
582+
split_regex = field_schema.get("x-split-regex")
583+
584+
if field_type == "array":
585+
# Process array fields
586+
df[column] = df[column].apply(lambda x: process_array_field(x, split_regex))
587+
588+
elif field_type == "number":
589+
# Convert to numeric, keeping strings that can't be converted
590+
df[column] = df[column].apply(lambda x: convert_to_number(x))
591+
592+
elif field_type == "string":
593+
# Ensure string type, replace NaN with empty string
594+
df[column] = df[column].fillna('')
595+
596+
# IMPROVEMENT 7: Convert to list of dicts efficiently
597+
# pandas to_dict is much faster than manual iteration
598+
json_list = df.to_dict('records')
599+
600+
return json_list
601+
602+
603+
def process_array_field(value, split_regex=None):
604+
"""
605+
Process array field with optional regex splitting.
606+
607+
IMPROVEMENT: Cleaner logic, better error handling
608+
"""
609+
if pd.isna(value) or not value or not isinstance(value, str):
610+
return []
611+
612+
value = value.strip()
613+
if not value:
614+
return []
615+
616+
try:
617+
if split_regex:
618+
split_values = re.split(split_regex, value)
619+
else:
620+
split_values = value.split(',')
621+
622+
# Strip and filter empty values
623+
return [v.strip() for v in split_values if v.strip()]
624+
625+
except re.error:
626+
# Fallback to comma split on regex error
627+
return [v.strip() for v in value.split(',') if v.strip()]
628+
629+
630+
def convert_to_number(value):
631+
"""
632+
Convert value to number (int or float) if possible, otherwise return original.
633+
"""
634+
if pd.isna(value) or not value or not isinstance(value, str):
635+
return None
636+
637+
value = value.strip()
638+
if not value:
639+
return None
640+
641+
try:
642+
if '.' in value:
643+
return float(value)
644+
else:
645+
return int(value)
646+
except ValueError:
647+
return value
648+
478649

650+
def process_oneof_field_pandas(value, field_schema):
651+
"""
652+
Process a field with oneOf schema.
653+
654+
IMPROVEMENT: Same logic but cleaner structure
655+
"""
656+
# If empty, return empty string
657+
if pd.isna(value) or not value or (isinstance(value, str) and not value.strip()):
658+
return ""
659+
660+
if isinstance(value, str):
661+
value = value.strip()
662+
663+
# Try to determine the correct type from oneOf options
664+
oneof_options = field_schema.get("oneOf", [])
665+
666+
for option in oneof_options:
667+
# Skip the empty string option
668+
if option.get("maxLength") == 0:
669+
continue
670+
671+
option_type = option.get("type")
672+
673+
# Try number conversion
674+
if option_type == "number":
675+
try:
676+
if '.' in str(value):
677+
return float(value)
678+
else:
679+
return int(value)
680+
except (ValueError, TypeError):
681+
continue
682+
683+
# Try array with enum
684+
if option_type == "array":
685+
split_regex = field_schema.get("x-split-regex", ",\\s*")
686+
try:
687+
split_values = re.split(split_regex, str(value))
688+
split_values = [v.strip() for v in split_values if v.strip()]
689+
if split_values:
690+
return split_values
691+
except:
692+
# Fallback to comma split
693+
split_values = [v.strip() for v in str(value).split(",") if v.strip()]
694+
if split_values:
695+
return split_values
696+
697+
# Check if it matches an enum
698+
if "enum" in option:
699+
if value in option["enum"]:
700+
return value
701+
702+
# If no type matched, return as string
703+
return str(value) if value is not None else ""
704+
705+
706+
def tsv_to_json_legacy(tsv_string, project_id):
707+
"""
708+
Legacy TSV to JSON conversion using manual parsing.
709+
Kept as fallback for the pandas-based implementation.
710+
"""
479711
tsv_string = tsv_string.replace('\r\n', '\n').replace('\r', '\n')
480712

481713
with get_db_cursor() as cursor:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ sendgrid==6.12.5
1010
minio==7.2.8
1111
pip_system_certs==5.3
1212
biopython==1.84
13+
pandas==2.2.3

0 commit comments

Comments
 (0)