Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 25 additions & 24 deletions helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)
import base64
from minio import Minio
from Bio import SeqIO

from database import get_db_cursor
import os
Expand Down Expand Up @@ -1076,34 +1077,34 @@ async def check_for_sequence_data(isolate, split_on_fasta_headers=True):
socket.setdefaulttimeout(original_timeout) # Restore timeout even on error
return False, f"Error loading FASTA file from MinIO: {str(e)}"

# 5. Parse the FASTA file to check if header is in the file
fasta_lines = fasta_content.splitlines()
sequence_lines = []
recording = False
header_found = False

for line in fasta_lines:
if line.startswith('>'):
# Check if this header matches what we're looking for
if line.startswith(f'>{fasta_header} ') or line == f'>{fasta_header}' or line.startswith(f'>{fasta_header}\t'):
recording = True
header_found = True
sequence_lines.append(line)
else:
# If we were recording and hit a different header, stop
if recording:
break
else:
if recording:
sequence_lines.append(line)

print(sequence_lines)
# 5. Parse the FASTA file using BioPython to check if header is in the file
fasta_handle = StringIO(fasta_content)

sequence_found = None
try:
for record in SeqIO.parse(fasta_handle, "fasta"):
# BioPython's record.id is the header without '>'
# record.description contains the full header line
# Check various header match patterns
if (record.id == fasta_header or
record.description == fasta_header or
record.description.startswith(fasta_header + ' ') or
record.description.startswith(fasta_header + '\t')):
sequence_found = record
break
except ValueError as e:
return False, f"Invalid FASTA format: {str(e)}"

# 6. Return error if header not found
if not header_found:
if not sequence_found:
return False, f"Header '{fasta_header}' not found in {fasta_file} for isolate '{isolate_sample_id}'"

sequence_data = '\n'.join(sequence_lines)
# Validate sequence isn't empty
if len(sequence_found.seq) == 0:
return False, f"Empty sequence found for header: {fasta_header}"

# Reconstruct FASTA format for storage
sequence_data = f">{sequence_found.description}\n{str(sequence_found.seq)}"
Copy link

Copilot AI Dec 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor behavioral change: the Biopython implementation returns the sequence as a single unwrapped line, whereas the original implementation preserved the line breaks from the source file. Both are valid FASTA, but if line wrapping is desired (typically 60-80 characters per line), consider using textwrap.fill() or Biopython's SeqIO.write(), whose FASTA writer wraps sequence lines at 60 characters by default.

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot open a new pull request to apply changes based on this feedback


if not sequence_data.strip():
return False, f"No sequence data found for isolate '{isolate_sample_id}'"
Comment on lines 1109 to 1110
Copy link

Copilot AI Dec 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check is now redundant. sequence_found is guaranteed to be non-None (line 1100) and to have a non-empty sequence (line 1104), and the reconstructed sequence_data at line 1108 always begins with '>', so it can never be empty or whitespace-only. Consider removing this check.

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot open a new pull request to apply changes based on this feedback

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ pytest==7.4.3
sendgrid==6.12.5
minio==7.2.8
pip_system_certs==5.3
biopython==1.84
pandas==2.2.3
1 change: 1 addition & 0 deletions worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import json
import asyncio
from io import StringIO
from jobs import get_next_job, mark_job_done, mark_job_failed
from database import get_db_cursor
from helpers import check_for_sequence_data, send_to_elastic2
Expand Down