diff --git a/helpers.py b/helpers.py index 853bd9b..2b09ff5 100644 --- a/helpers.py +++ b/helpers.py @@ -26,6 +26,7 @@ ) import base64 from minio import Minio +from Bio import SeqIO from database import get_db_cursor import os @@ -1076,34 +1077,34 @@ async def check_for_sequence_data(isolate, split_on_fasta_headers=True): socket.setdefaulttimeout(original_timeout) # Restore timeout even on error return False, f"Error loading FASTA file from MinIO: {str(e)}" - # 5. Parse the FASTA file to check if header is in the file - fasta_lines = fasta_content.splitlines() - sequence_lines = [] - recording = False - header_found = False - - for line in fasta_lines: - if line.startswith('>'): - # Check if this header matches what we're looking for - if line.startswith(f'>{fasta_header} ') or line == f'>{fasta_header}' or line.startswith(f'>{fasta_header}\t'): - recording = True - header_found = True - sequence_lines.append(line) - else: - # If we were recording and hit a different header, stop - if recording: - break - else: - if recording: - sequence_lines.append(line) - - print(sequence_lines) + # 5. Parse the FASTA file using BioPython to check if header is in the file + fasta_handle = StringIO(fasta_content) + + sequence_found = None + try: + for record in SeqIO.parse(fasta_handle, "fasta"): + # BioPython's record.id is the header without '>' + # record.description contains the full header line + # Check various header match patterns + if (record.id == fasta_header or + record.description == fasta_header or + record.description.startswith(fasta_header + ' ') or + record.description.startswith(fasta_header + '\t')): + sequence_found = record + break + except ValueError as e: + return False, f"Invalid FASTA format: {str(e)}" # 6. Return error if header not found - if not header_found: + if not sequence_found: return False, f"Header '{fasta_header}' not found in {fasta_file} for isolate '{isolate_sample_id}'" - sequence_data = '\n'.join(sequence_lines) + # Validate sequence isn't empty + if len(sequence_found.seq) == 0: + return False, f"Empty sequence found for header: {fasta_header}" + + # Reconstruct FASTA format for storage + sequence_data = f">{sequence_found.description}\n{str(sequence_found.seq)}" if not sequence_data.strip(): return False, f"No sequence data found for isolate '{isolate_sample_id}'" diff --git a/requirements.txt b/requirements.txt index 945043d..a08ea91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ pytest==7.4.3 sendgrid==6.12.5 minio==7.2.8 pip_system_certs==5.3 +biopython==1.84 pandas==2.2.3 diff --git a/worker.py b/worker.py index f6bb596..ad36d77 100644 --- a/worker.py +++ b/worker.py @@ -3,6 +3,7 @@ import sys import json import asyncio +from io import StringIO from jobs import get_next_job, mark_job_done, mark_job_failed from database import get_db_cursor from helpers import check_for_sequence_data, send_to_elastic2