Skip to content

Commit 2e244bb

Browse files
authored
Merge pull request #223 from OpenUpSA/elastic-debugging
tsv_to_json update
2 parents 0b32cfc + dac2258 commit 2e244bb

File tree

1 file changed

+33
-40
lines changed

1 file changed

+33
-40
lines changed

helpers.py

Lines changed: 33 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -489,17 +489,9 @@ def tsv_to_json(tsv_string, project_id):
489489

490490
def tsv_to_json_pandas(tsv_string, project_id):
491491
"""
492-
Refactored TSV to JSON conversion using pandas for parsing and jsonschema for validation.
493-
494-
IMPROVEMENTS:
495-
1. Pandas handles TSV parsing robustly (edge cases, escaping, quotes)
496-
2. Vectorized operations are much faster for large datasets
497-
3. Automatic type inference with better null handling
498-
4. Less manual string manipulation
499-
5. Built-in handling of missing values
500-
6. Cleaner, more maintainable code
492+
TSV to JSON conversion using pandas, with schema-driven type handling.
493+
Matches legacy functionality but is faster and more robust.
501494
"""
502-
503495
with get_db_cursor() as cursor:
504496
# Get schema
505497
cursor.execute(
@@ -544,54 +536,55 @@ def tsv_to_json_pandas(tsv_string, project_id):
544536

545537
schema = schema_record["schema"]
546538

547-
# IMPROVEMENT 1: Use pandas to parse TSV
548-
539+
# Parse TSV with pandas
549540
try:
550541
df = pd.read_csv(
551542
StringIO(tsv_string),
552543
sep='\t',
553-
dtype=str, # Read everything as string initially
554-
keep_default_na=False, # Don't convert empty strings to NaN
555-
na_values=[''], # But still recognize empty strings as NA
556-
skipinitialspace=True # Strip leading whitespace
544+
dtype=str,
545+
keep_default_na=False,
546+
na_values=[''],
547+
skipinitialspace=True
557548
)
558549
except Exception as e:
559550
raise ValueError(f"Failed to parse TSV: {str(e)}")
560551

561-
562552
df.columns = df.columns.str.strip()
563-
564-
565553
properties = schema.get("properties", {})
566-
554+
567555
for column in df.columns:
568556
if column not in properties:
569557
continue # Skip columns not in schema
570-
558+
571559
field_schema = properties[column]
572-
560+
573561
# Handle oneOf fields
574562
if "oneOf" in field_schema:
575-
# Apply the oneOf processing to each value in the column
576563
df[column] = df[column].apply(lambda x: process_oneof_field_pandas(x, field_schema))
577-
else:
578-
# Process based on field type
579-
field_type = field_schema.get("type")
580-
split_regex = field_schema.get("x-split-regex")
581-
582-
if field_type == "array":
583-
# Process array fields
584-
df[column] = df[column].apply(lambda x: process_array_field(x, split_regex))
585-
586-
elif field_type == "number":
587-
# Convert to numeric, keeping strings that can't be converted
588-
df[column] = df[column].apply(lambda x: convert_to_number(x))
589-
590-
elif field_type == "string":
591-
# Ensure string type, replace NaN with empty string
592-
df[column] = df[column].fillna('')
564+
continue
565+
566+
field_type = field_schema.get("type")
567+
split_regex = field_schema.get("x-split-regex")
568+
569+
# Handle arrays
570+
if field_type == "array":
571+
df[column] = df[column].apply(lambda x: process_array_field(x, split_regex))
572+
573+
# Handle numbers
574+
elif field_type == "number":
575+
df[column] = df[column].apply(lambda x: convert_to_number(x))
576+
577+
# Handle dates (string with format date)
578+
elif field_type == "string" and field_schema.get("format") == "date":
579+
df[column] = df[column].apply(lambda x: x if x and x.strip() else None)
580+
581+
# Handle strings
582+
elif field_type == "string":
583+
df[column] = df[column].apply(lambda x: x if x is not None else "")
584+
585+
# Other types: leave as is
593586

594-
# Replace all NaN with None so JSON is valid for Postgres
587+
# Replace all NaN with None for JSON compatibility
595588
df = df.where(pd.notnull(df), None)
596589
json_list = df.to_dict('records')
597590
return json_list

0 commit comments

Comments (0)