Commit 834039a

Merge pull request #201 from OpenUpSA/pandas
pandas
2 parents dcdbb59 + 72b8202

File tree

2 files changed: +235 -2 lines

helpers.py

Lines changed: 234 additions & 2 deletions
@@ -8,6 +8,9 @@
 import settings
 from jsonschema import validate, ValidationError, Draft7Validator
 from flask import render_template_string
+import pandas as pd
+from io import StringIO
+import re
 from auth import KeycloakAuth
 from sendgrid import SendGridAPIClient
 from sendgrid.helpers.mail import (
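For context, a minimal sketch (not part of the commit; the column names are made up) of how pandas and StringIO combine to parse a TSV string with the same options the new code uses:

import pandas as pd
from io import StringIO

tsv = "name\tcount\nalpha\t1\nbeta\t"

df = pd.read_csv(
    StringIO(tsv),
    sep='\t',
    dtype=str,              # keep raw strings; defer type conversion
    keep_default_na=False,  # disable pandas' default NA token list ("NA", "null", ...)
    na_values=[''],         # but still treat truly empty cells as missing
)
print(df.to_dict('records'))
# [{'name': 'alpha', 'count': '1'}, {'name': 'beta', 'count': nan}]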
@@ -470,10 +473,239 @@ def get_minio_client(self):
 
         except Exception as e:
             raise e
-
+
+
 def tsv_to_json(tsv_string, project_id):
-    import re
+    """
+    Convert TSV string to JSON list using pandas for robust parsing.
+    Falls back to legacy implementation on error.
+    """
+    try:
+        return tsv_to_json_pandas(tsv_string, project_id)
+    except Exception as e:
+        logger.warning(f"Pandas TSV parsing failed, falling back to legacy: {e}")
+        return tsv_to_json_legacy(tsv_string, project_id)
+
+
+def tsv_to_json_pandas(tsv_string, project_id):
+    """
+    Refactored TSV to JSON conversion using pandas for parsing and jsonschema for validation.
+
+    IMPROVEMENTS:
+    1. Pandas handles TSV parsing robustly (edge cases, escaping, quotes)
+    2. Vectorized operations are much faster for large datasets
+    3. Automatic type inference with better null handling
+    4. Less manual string manipulation
+    5. Built-in handling of missing values
+    6. Cleaner, more maintainable code
+    """
+
+    with get_db_cursor() as cursor:
+        # Get schema
+        cursor.execute(
+            """
+            SELECT pathogen_id
+            FROM projects
+            WHERE id = %s AND deleted_at IS NULL
+            """,
+            (project_id,),
+        )
+        project_record = cursor.fetchone()
+        if not project_record:
+            raise ValueError(f"Project ID {project_id} not found")
+
+        pathogen_id = project_record["pathogen_id"]
+
+        cursor.execute(
+            """
+            SELECT schema_id
+            FROM pathogens
+            WHERE id = %s AND deleted_at IS NULL
+            """,
+            (pathogen_id,),
+        )
+        pathogen_record = cursor.fetchone()
+        if not pathogen_record:
+            raise ValueError(f"Pathogen ID {pathogen_id} not found")
+
+        schema_id = pathogen_record["schema_id"]
+
+        cursor.execute(
+            """
+            SELECT schema
+            FROM schemas
+            WHERE id = %s AND deleted_at IS NULL
+            """,
+            (schema_id,),
+        )
+        schema_record = cursor.fetchone()
+        if not schema_record:
+            raise ValueError(f"Schema ID {schema_id} not found")
+
+        schema = schema_record["schema"]
+
+        # IMPROVEMENT 1: Use pandas to parse TSV
+        # Handles edge cases like quoted fields, embedded tabs, etc.
+        try:
+            df = pd.read_csv(
+                StringIO(tsv_string),
+                sep='\t',
+                dtype=str,  # Read everything as string initially
+                keep_default_na=False,  # Don't convert empty strings to NaN
+                na_values=[''],  # But still recognize empty strings as NA
+                skipinitialspace=True  # Strip leading whitespace
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to parse TSV: {str(e)}")
+
+        # IMPROVEMENT 2: Strip whitespace from column names
+        df.columns = df.columns.str.strip()
+
+        # IMPROVEMENT 3: Process columns based on schema using vectorized operations
+        properties = schema.get("properties", {})
+
+        for column in df.columns:
+            if column not in properties:
+                continue  # Skip columns not in schema
+
+            field_schema = properties[column]
+
+            # Handle oneOf fields
+            if "oneOf" in field_schema:
+                # Apply the oneOf processing to each value in the column
+                df[column] = df[column].apply(lambda x: process_oneof_field_pandas(x, field_schema))
+            else:
+                # Process based on field type
+                field_type = field_schema.get("type")
+                split_regex = field_schema.get("x-split-regex")
+
+                if field_type == "array":
+                    # Process array fields
+                    df[column] = df[column].apply(lambda x: process_array_field(x, split_regex))
+
+                elif field_type == "number":
+                    # Convert to numeric, keeping strings that can't be converted
+                    df[column] = df[column].apply(lambda x: convert_to_number(x))
+
+                elif field_type == "string":
+                    # Ensure string type, replace NaN with empty string
+                    df[column] = df[column].fillna('')
+
+        # IMPROVEMENT 4: Convert to list of dicts efficiently
+        # pandas to_dict is much faster than manual iteration
+        json_list = df.to_dict('records')
+
+        return json_list
 
+
+def process_array_field(value, split_regex=None):
+    """
+    Process array field with optional regex splitting.
+
+    IMPROVEMENT: Cleaner logic, better error handling
+    """
+    if pd.isna(value) or not value or not isinstance(value, str):
+        return []
+
+    value = value.strip()
+    if not value:
+        return []
+
+    try:
+        if split_regex:
+            split_values = re.split(split_regex, value)
+        else:
+            split_values = value.split(',')
+
+        # Strip and filter empty values
+        return [v.strip() for v in split_values if v.strip()]
+
+    except re.error:
+        # Fallback to comma split on regex error
+        return [v.strip() for v in value.split(',') if v.strip()]
+
+
+def convert_to_number(value):
+    """
+    Convert value to number (int or float) if possible, otherwise return original.
+    """
+    if pd.isna(value) or not value or not isinstance(value, str):
+        return None
+
+    value = value.strip()
+    if not value:
+        return None
+
+    try:
+        if '.' in value:
+            return float(value)
+        else:
+            return int(value)
+    except ValueError:
+        return value
+
+
+def process_oneof_field_pandas(value, field_schema):
+    """
+    Process a field with oneOf schema.
+
+    IMPROVEMENT: Same logic but cleaner structure
+    """
+    # If empty, return empty string
+    if pd.isna(value) or not value or (isinstance(value, str) and not value.strip()):
+        return ""
+
+    if isinstance(value, str):
+        value = value.strip()
+
+    # Try to determine the correct type from oneOf options
+    oneof_options = field_schema.get("oneOf", [])
+
+    for option in oneof_options:
+        # Skip the empty string option
+        if option.get("maxLength") == 0:
+            continue
+
+        option_type = option.get("type")
+
+        # Try number conversion
+        if option_type == "number":
+            try:
+                if '.' in str(value):
+                    return float(value)
+                else:
+                    return int(value)
+            except (ValueError, TypeError):
+                continue
+
+        # Try array with enum
+        if option_type == "array":
+            split_regex = field_schema.get("x-split-regex", ",\\s*")
+            try:
+                split_values = re.split(split_regex, str(value))
+                split_values = [v.strip() for v in split_values if v.strip()]
+                if split_values:
+                    return split_values
+            except re.error:
+                # Fallback to comma split on regex error
+                split_values = [v.strip() for v in str(value).split(",") if v.strip()]
+                if split_values:
+                    return split_values
+
+        # Check if it matches an enum
+        if "enum" in option:
+            if value in option["enum"]:
+                return value
+
+    # If no type matched, return as string
+    return str(value) if value is not None else ""
+
+
+def tsv_to_json_legacy(tsv_string, project_id):
+    """
+    Legacy TSV to JSON conversion using manual parsing.
+    Kept as fallback for the pandas-based implementation.
+    """
     tsv_string = tsv_string.replace('\r\n', '\n').replace('\r', '\n')
 
     with get_db_cursor() as cursor:
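The pure helpers in this diff are easy to exercise in isolation. A few illustrative calls (inputs and the oneOf schema below are made up for the example; it assumes the functions are importable from helpers.py):

from helpers import process_array_field, convert_to_number, process_oneof_field_pandas

# Array fields: default comma split, or a custom x-split-regex
print(process_array_field("a, b,,c"))          # ['a', 'b', 'c']
print(process_array_field("a;b; c", r";\s*"))  # ['a', 'b', 'c']

# Number fields: int/float when possible, original string otherwise
print(convert_to_number("42"))    # 42
print(convert_to_number("3.14"))  # 3.14
print(convert_to_number("n/a"))   # 'n/a'

# oneOf fields: the first matching option wins
one_of = {"oneOf": [{"type": "string", "maxLength": 0},
                    {"type": "number"},
                    {"type": "string", "enum": ["unknown"]}]}
print(process_oneof_field_pandas("7", one_of))        # 7
print(process_oneof_field_pandas("unknown", one_of))  # 'unknown'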

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ pytest==7.4.3
 sendgrid==6.12.5
 minio==7.2.8
 pip_system_certs==5.3
+pandas==2.2.3
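Since the new dependency is pinned, a quick runtime check (a sketch, not part of the commit) can confirm the deployed environment matches requirements.txt:

import pandas as pd

# Expect the exact version pinned in requirements.txt
assert pd.__version__ == "2.2.3", f"unexpected pandas version: {pd.__version__}"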
