-
Notifications
You must be signed in to change notification settings - Fork 0
handling fasta files with biopython #202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| ) | ||
| import base64 | ||
| from minio import Minio | ||
| from Bio import SeqIO | ||
|
|
||
| from database import get_db_cursor | ||
| import os | ||
|
|
@@ -1076,34 +1077,34 @@ async def check_for_sequence_data(isolate, split_on_fasta_headers=True): | |
| socket.setdefaulttimeout(original_timeout) # Restore timeout even on error | ||
| return False, f"Error loading FASTA file from MinIO: {str(e)}" | ||
|
|
||
| # 5. Parse the FASTA file to check if header is in the file | ||
| fasta_lines = fasta_content.splitlines() | ||
| sequence_lines = [] | ||
| recording = False | ||
| header_found = False | ||
|
|
||
| for line in fasta_lines: | ||
| if line.startswith('>'): | ||
| # Check if this header matches what we're looking for | ||
| if line.startswith(f'>{fasta_header} ') or line == f'>{fasta_header}' or line.startswith(f'>{fasta_header}\t'): | ||
| recording = True | ||
| header_found = True | ||
| sequence_lines.append(line) | ||
| else: | ||
| # If we were recording and hit a different header, stop | ||
| if recording: | ||
| break | ||
| else: | ||
| if recording: | ||
| sequence_lines.append(line) | ||
|
|
||
| print(sequence_lines) | ||
| # 5. Parse the FASTA file using BioPython to check if header is in the file | ||
| fasta_handle = StringIO(fasta_content) | ||
|
|
||
| sequence_found = None | ||
| try: | ||
| for record in SeqIO.parse(fasta_handle, "fasta"): | ||
| # BioPython's record.id is the header without '>' | ||
| # record.description contains the full header line | ||
| # Check various header match patterns | ||
| if (record.id == fasta_header or | ||
| record.description == fasta_header or | ||
| record.description.startswith(fasta_header + ' ') or | ||
| record.description.startswith(fasta_header + '\t')): | ||
| sequence_found = record | ||
| break | ||
| except ValueError as e: | ||
| return False, f"Invalid FASTA format: {str(e)}" | ||
|
|
||
| # 6. Return error if header not found | ||
| if not header_found: | ||
| if not sequence_found: | ||
| return False, f"Header '{fasta_header}' not found in {fasta_file} for isolate '{isolate_sample_id}'" | ||
|
|
||
| sequence_data = '\n'.join(sequence_lines) | ||
| # Validate sequence isn't empty | ||
| if len(sequence_found.seq) == 0: | ||
| return False, f"Empty sequence found for header: {fasta_header}" | ||
|
|
||
| # Reconstruct FASTA format for storage | ||
| sequence_data = f">{sequence_found.description}\n{str(sequence_found.seq)}" | ||
|
|
||
| if not sequence_data.strip(): | ||
| return False, f"No sequence data found for isolate '{isolate_sample_id}'" | ||
|
Comment on lines 1109 to 1110
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,4 +9,5 @@ pytest==7.4.3 | |
| sendgrid==6.12.5 | ||
| minio==7.2.8 | ||
| pip_system_certs==5.3 | ||
| biopython==1.84 | ||
| pandas==2.2.3 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor behavioral change: The BioPython implementation returns the sequence as a single unwrapped line, whereas the original implementation preserved line breaks from the source file. While both are valid FASTA format, if line wrapping is desired (typically 60-80 characters per line), consider using `textwrap.fill()` or BioPython's `SeqIO.write()` to wrap the sequence.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@copilot open a new pull request to apply changes based on this feedback