@@ -489,17 +489,9 @@ def tsv_to_json(tsv_string, project_id):
489489
490490def tsv_to_json_pandas (tsv_string , project_id ):
491491 """
492- Refactored TSV to JSON conversion using pandas for parsing and jsonschema for validation.
493-
494- IMPROVEMENTS:
495- 1. Pandas handles TSV parsing robustly (edge cases, escaping, quotes)
496- 2. Vectorized operations are much faster for large datasets
497- 3. Automatic type inference with better null handling
498- 4. Less manual string manipulation
499- 5. Built-in handling of missing values
500- 6. Cleaner, more maintainable code
492+ TSV to JSON conversion using pandas, with schema-driven type handling.
493+ Matches legacy functionality but is faster and more robust.
501494 """
502-
503495 with get_db_cursor () as cursor :
504496 # Get schema
505497 cursor .execute (
@@ -544,54 +536,55 @@ def tsv_to_json_pandas(tsv_string, project_id):
544536
545537 schema = schema_record ["schema" ]
546538
547- # IMPROVEMENT 1: Use pandas to parse TSV
548-
539+ # Parse TSV with pandas
549540 try :
550541 df = pd .read_csv (
551542 StringIO (tsv_string ),
552543 sep = '\t ' ,
553- dtype = str , # Read everything as string initially
554- keep_default_na = False , # Don't convert empty strings to NaN
555- na_values = ['' ], # But still recognize empty strings as NA
556- skipinitialspace = True # Strip leading whitespace
544+ dtype = str ,
545+ keep_default_na = False ,
546+ na_values = ['' ],
547+ skipinitialspace = True
557548 )
558549 except Exception as e :
559550 raise ValueError (f"Failed to parse TSV: { str (e )} " )
560551
561-
562552 df .columns = df .columns .str .strip ()
563-
564-
565553 properties = schema .get ("properties" , {})
566-
554+
567555 for column in df .columns :
568556 if column not in properties :
569557 continue # Skip columns not in schema
570-
558+
571559 field_schema = properties [column ]
572-
560+
573561 # Handle oneOf fields
574562 if "oneOf" in field_schema :
575- # Apply the oneOf processing to each value in the column
576563 df [column ] = df [column ].apply (lambda x : process_oneof_field_pandas (x , field_schema ))
577- else :
578- # Process based on field type
579- field_type = field_schema .get ("type" )
580- split_regex = field_schema .get ("x-split-regex" )
581-
582- if field_type == "array" :
583- # Process array fields
584- df [column ] = df [column ].apply (lambda x : process_array_field (x , split_regex ))
585-
586- elif field_type == "number" :
587- # Convert to numeric, keeping strings that can't be converted
588- df [column ] = df [column ].apply (lambda x : convert_to_number (x ))
589-
590- elif field_type == "string" :
591- # Ensure string type, replace NaN with empty string
592- df [column ] = df [column ].fillna ('' )
564+ continue
565+
566+ field_type = field_schema .get ("type" )
567+ split_regex = field_schema .get ("x-split-regex" )
568+
569+ # Handle arrays
570+ if field_type == "array" :
571+ df [column ] = df [column ].apply (lambda x : process_array_field (x , split_regex ))
572+
573+ # Handle numbers
574+ elif field_type == "number" :
575+ df [column ] = df [column ].apply (lambda x : convert_to_number (x ))
576+
577+ # Handle dates (string with format date)
578+ elif field_type == "string" and field_schema .get ("format" ) == "date" :
579+ df [column ] = df [column ].apply (lambda x : x if x and x .strip () else None )
580+
581+ # Handle strings
582+ elif field_type == "string" :
583+ df [column ] = df [column ].apply (lambda x : x if x is not None else "" )
584+
585+ # Other types: leave as is
593586
594- # Replace all NaN with None so JSON is valid for Postgres
587+ # Replace all NaN with None for JSON compatibility
595588 df = df .where (pd .notnull (df ), None )
596589 json_list = df .to_dict ('records' )
597590 return json_list
0 commit comments