From 568a6acc5959b843df96bdd659049691e65e2c6d Mon Sep 17 00:00:00 2001 From: desafinadude Date: Sat, 6 Dec 2025 19:33:06 +0200 Subject: [PATCH 1/3] handling fasta files with biopython --- helpers.py | 50 +++++++++++++++++++++++++----------------------- requirements.txt | 1 + worker.py | 2 ++ 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/helpers.py b/helpers.py index c16fe27..0d0ee6e 100644 --- a/helpers.py +++ b/helpers.py @@ -23,6 +23,8 @@ ) import base64 from minio import Minio +from Bio import SeqIO +from io import StringIO from database import get_db_cursor import os @@ -844,34 +846,34 @@ async def check_for_sequence_data(isolate, split_on_fasta_headers=True): socket.setdefaulttimeout(original_timeout) # Restore timeout even on error return False, f"Error loading FASTA file from MinIO: {str(e)}" - # 5. Parse the FASTA file to check if header is in the file - fasta_lines = fasta_content.splitlines() - sequence_lines = [] - recording = False - header_found = False - - for line in fasta_lines: - if line.startswith('>'): - # Check if this header matches what we're looking for - if line.startswith(f'>{fasta_header} ') or line == f'>{fasta_header}' or line.startswith(f'>{fasta_header}\t'): - recording = True - header_found = True - sequence_lines.append(line) - else: - # If we were recording and hit a different header, stop - if recording: - break - else: - if recording: - sequence_lines.append(line) - - print(sequence_lines) + # 5. Parse the FASTA file using BioPython to check if header is in the file + fasta_handle = StringIO(fasta_content) + + sequence_found = None + try: + for record in SeqIO.parse(fasta_handle, "fasta"): + # BioPython's record.id is the header without '>' + # record.description contains the full header line + # Check various header match patterns + if (record.id == fasta_header or + record.description == fasta_header or + record.description.startswith(fasta_header + ' ') or + record.description.startswith(fasta_header + '\t')): + sequence_found = record + break + except ValueError as e: + return False, f"Invalid FASTA format: {str(e)}" # 6. Return error if header not found - if not header_found: + if not sequence_found: return False, f"Header '{fasta_header}' not found in {fasta_file} for isolate '{isolate_sample_id}'" - sequence_data = '\n'.join(sequence_lines) + # Validate sequence isn't empty + if len(sequence_found.seq) == 0: + return False, f"Empty sequence found for header: {fasta_header}" + + # Reconstruct FASTA format for storage + sequence_data = f">{sequence_found.description}\n{str(sequence_found.seq)}" if not sequence_data.strip(): return False, f"No sequence data found for isolate '{isolate_sample_id}'" diff --git a/requirements.txt b/requirements.txt index eb5dc1b..5d3df7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pytest==7.4.3 sendgrid==6.12.5 minio==7.2.8 pip_system_certs==5.3 +biopython==1.84 diff --git a/worker.py b/worker.py index f6bb596..fbb7e47 100644 --- a/worker.py +++ b/worker.py @@ -3,6 +3,8 @@ import sys import json import asyncio +from io import StringIO +from Bio import SeqIO from jobs import get_next_job, mark_job_done, mark_job_failed from database import get_db_cursor from helpers import check_for_sequence_data, send_to_elastic2 From ea2b437ee03a0df302b48bd2783d1c41bd5a1419 Mon Sep 17 00:00:00 2001 From: Di M Date: Mon, 8 Dec 2025 12:09:12 +0200 Subject: [PATCH 2/3] Update worker.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/worker.py b/worker.py index fbb7e47..ad36d77 100644 --- a/worker.py +++ b/worker.py @@ -4,7 +4,6 @@ import json import asyncio from io import StringIO -from Bio import SeqIO from jobs import get_next_job, mark_job_done, mark_job_failed from database import get_db_cursor from helpers import check_for_sequence_data, send_to_elastic2 From e7419821388e44d005f6b4c179abe9bc5c42084e Mon Sep 17 00:00:00 2001 From: Di M Date: Mon, 8 Dec 2025 12:09:27 +0200 Subject: [PATCH 3/3] Update helpers.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/helpers.py b/helpers.py index ec266f0..2b09ff5 100644 --- a/helpers.py +++ b/helpers.py @@ -27,7 +27,6 @@ import base64 from minio import Minio from Bio import SeqIO -from io import StringIO from database import get_db_cursor import os