|
8 | 8 | import settings |
9 | 9 | from jsonschema import validate, ValidationError, Draft7Validator |
10 | 10 | from flask import render_template_string |
| 11 | +import pandas as pd |
| 12 | +from io import StringIO |
| 13 | +import re |
11 | 14 | from auth import KeycloakAuth |
12 | 15 | from sendgrid import SendGridAPIClient |
13 | 16 | from sendgrid.helpers.mail import ( |
@@ -470,10 +473,239 @@ def get_minio_client(self): |
470 | 473 |
|
471 | 474 | except Exception as e: |
472 | 475 | raise e |
473 | | - |
| 476 | + |
| 477 | + |
def tsv_to_json(tsv_string, project_id):
    """
    Convert a TSV string into a list of JSON records for *project_id*.

    The pandas-based parser is attempted first; any failure there is
    logged as a warning and the legacy manual parser is used instead.
    """
    try:
        result = tsv_to_json_pandas(tsv_string, project_id)
    except Exception as e:
        logger.warning(f"Pandas TSV parsing failed, falling back to legacy: {e}")
        result = tsv_to_json_legacy(tsv_string, project_id)
    return result
| 488 | + |
| 489 | + |
def tsv_to_json_pandas(tsv_string, project_id):
    """
    Convert a TSV string into a list of JSON-ready record dicts, typed
    according to the project's pathogen schema.

    Resolves project -> pathogen -> schema via the database, parses the
    TSV with pandas, then coerces each schema-covered column:
      - "oneOf" fields go through process_oneof_field_pandas
      - "array" fields are split (optionally via the schema's x-split-regex)
      - "number" fields are converted via convert_to_number
      - "string" fields have missing values replaced with ""
    Columns absent from the schema are passed through untouched (but with
    missing cells normalized to None, see below).

    Raises:
        ValueError: if the project, pathogen, or schema cannot be found,
                    or if the TSV cannot be parsed.
    """
    with get_db_cursor() as cursor:
        # Resolve the project's pathogen.
        cursor.execute(
            """
            SELECT pathogen_id
            FROM projects
            WHERE id = %s AND deleted_at IS NULL
            """,
            (project_id,),
        )
        project_record = cursor.fetchone()
        if not project_record:
            raise ValueError(f"Project ID {project_id} not found")

        pathogen_id = project_record["pathogen_id"]

        # Resolve the pathogen's schema id.
        cursor.execute(
            """
            SELECT schema_id
            FROM pathogens
            WHERE id = %s AND deleted_at IS NULL
            """,
            (pathogen_id,),
        )
        pathogen_record = cursor.fetchone()
        if not pathogen_record:
            raise ValueError(f"Pathogen ID {pathogen_id} not found")

        schema_id = pathogen_record["schema_id"]

        # Load the JSON schema document itself.
        cursor.execute(
            """
            SELECT schema
            FROM schemas
            WHERE id = %s AND deleted_at IS NULL
            """,
            (schema_id,),
        )
        schema_record = cursor.fetchone()
        if not schema_record:
            raise ValueError(f"Schema ID {schema_id} not found")

        schema = schema_record["schema"]

        # Parse the TSV with pandas: handles quoted fields, embedded
        # delimiters, and ragged whitespace far more robustly than a
        # manual split loop.
        try:
            df = pd.read_csv(
                StringIO(tsv_string),
                sep='\t',
                dtype=str,              # read everything as string; the schema drives typing below
                keep_default_na=False,  # disable pandas' default NA tokens ("NA", "null", ...)
                na_values=[''],         # ...but still treat truly empty cells as missing (NaN)
                skipinitialspace=True   # strip whitespace immediately after each delimiter
            )
        except Exception as e:
            raise ValueError(f"Failed to parse TSV: {str(e)}")

        # Header cells often carry stray whitespace; normalize before
        # matching against schema property names.
        df.columns = df.columns.str.strip()

        # Coerce each column that the schema knows about.
        properties = schema.get("properties", {})

        for column in df.columns:
            if column not in properties:
                continue  # leave unknown columns as plain strings

            field_schema = properties[column]

            if "oneOf" in field_schema:
                # oneOf fields need per-value type resolution.
                df[column] = df[column].apply(
                    process_oneof_field_pandas, field_schema=field_schema
                )
            else:
                field_type = field_schema.get("type")
                split_regex = field_schema.get("x-split-regex")

                if field_type == "array":
                    df[column] = df[column].apply(
                        process_array_field, split_regex=split_regex
                    )
                elif field_type == "number":
                    df[column] = df[column].apply(convert_to_number)
                elif field_type == "string":
                    # Missing string cells become "" rather than NaN.
                    df[column] = df[column].fillna('')

        # FIX: any cell still missing at this point (e.g. in columns not
        # covered by the schema) would surface as float('nan') in the
        # records — not valid JSON and a guaranteed schema-validation
        # failure. Normalize every remaining NaN to None.
        df = df.astype(object).where(df.notna(), None)

        # Single C-level conversion to a list of row dicts.
        json_list = df.to_dict('records')

        return json_list
476 | 599 |
|
| 600 | + |
def process_array_field(value, split_regex=None):
    """
    Split a raw TSV cell into a list of trimmed, non-empty strings.

    When *split_regex* is provided it is used as the delimiter pattern;
    otherwise the value is split on commas. An invalid regex degrades
    gracefully to the comma split. Missing, blank, or non-string input
    yields an empty list.
    """
    if pd.isna(value) or not value or not isinstance(value, str):
        return []

    stripped = value.strip()
    if not stripped:
        return []

    def _clean(parts):
        # Trim each piece and drop empties left behind by stray delimiters.
        return [part.strip() for part in parts if part.strip()]

    if not split_regex:
        return _clean(stripped.split(','))

    try:
        return _clean(re.split(split_regex, stripped))
    except re.error:
        # Bad pattern supplied by the schema: fall back to comma split.
        return _clean(stripped.split(','))
| 626 | + |
| 627 | + |
def convert_to_number(value):
    """
    Coerce a raw TSV cell to int or float where possible.

    Returns:
        int when the value parses as a plain integer, float when it
        contains a decimal point or an exponent (e.g. "1e3"), the
        original string when it cannot be parsed as a number, and
        None for missing / blank / non-string input.
    """
    if pd.isna(value) or not value or not isinstance(value, str):
        return None

    value = value.strip()
    if not value:
        return None

    try:
        # BUG FIX: scientific notation ("1e3", "2E2") contains no '.'
        # and previously fell into int(), raised ValueError, and was
        # returned as an unconverted string.
        if '.' in value or 'e' in value.lower():
            return float(value)
        return int(value)
    except ValueError:
        return value
| 646 | + |
| 647 | + |
def process_oneof_field_pandas(value, field_schema):
    """
    Coerce a raw TSV cell for a field declared with a "oneOf" schema.

    Tries each non-empty oneOf option in declaration order:
      - "number": int/float conversion (skipped on failure)
      - "array":  regex split using x-split-regex (default ",\\s*")
      - "enum":   exact membership check
    Returns "" for missing / blank input, and the value as a string
    when no option matches.
    """
    # Missing or blank cells map to the empty-string option.
    if pd.isna(value) or not value or (isinstance(value, str) and not value.strip()):
        return ""

    if isinstance(value, str):
        value = value.strip()

    oneof_options = field_schema.get("oneOf", [])

    for option in oneof_options:
        # maxLength == 0 marks the "allow empty string" option; skip it
        # for non-empty values.
        if option.get("maxLength") == 0:
            continue

        option_type = option.get("type")

        if option_type == "number":
            try:
                if '.' in str(value):
                    return float(value)
                return int(value)
            except (ValueError, TypeError):
                continue  # not numeric; try the next option

        if option_type == "array":
            split_regex = field_schema.get("x-split-regex", ",\\s*")
            try:
                split_values = re.split(split_regex, str(value))
                split_values = [v.strip() for v in split_values if v.strip()]
                if split_values:
                    return split_values
            except re.error:
                # BUG FIX: was a bare `except:` that swallowed everything
                # (including SystemExit/KeyboardInterrupt). re.split only
                # fails here on an invalid pattern; catch exactly that and
                # fall back to a plain comma split.
                split_values = [v.strip() for v in str(value).split(",") if v.strip()]
                if split_values:
                    return split_values

        # Exact enum membership.
        if "enum" in option:
            if value in option["enum"]:
                return value

    # No option matched: hand the value back as a string.
    return str(value) if value is not None else ""
| 702 | + |
| 703 | + |
| 704 | +def tsv_to_json_legacy(tsv_string, project_id): |
| 705 | + """ |
| 706 | + Legacy TSV to JSON conversion using manual parsing. |
| 707 | + Kept as fallback for the pandas-based implementation. |
| 708 | + """ |
477 | 709 | tsv_string = tsv_string.replace('\r\n', '\n').replace('\r', '\n') |
478 | 710 |
|
479 | 711 | with get_db_cursor() as cursor: |
|
0 commit comments