Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions barcodeforge/ref_muts.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ def process_and_reroot_lineages(
seqs = SeqIO.to_dict(SeqIO.parse(sequences_fasta_path, "fasta"))
ref = SeqIO.read(reference_fasta_path, "fasta")

sample_ids = set(sample_muts_df["sample"])
missing_count = len(sample_ids - set(seqs.keys()))
if missing_count:
total_count = len(sample_ids)
console.print(
f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]"
)

# if reference in the sample mutations file, use that as the root
if sample_muts_df[sample_muts_df["sample"] == ref.id].shape[0] > 0:
console.print(
Expand Down Expand Up @@ -170,18 +178,14 @@ def process_and_reroot_lineages(
)
seq = seqs.get(sample_id, None)
if seq is None:
# It's better to raise an error or handle this case explicitly
console.print(
f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]"
)
# In debug mode, also log a per-sample warning; the aggregate missing-sample summary is already printed earlier in this function.
if debug:
console.print(
f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]"
)
continue
root_seqs.append(_construct_root_sequence(root_muts, seq))

if not root_seqs:
raise ValueError(
"No valid root sequences could be generated. Check input FASTA and sample mutations."
)

root = _derive_root_sequence(root_seqs)
additional_muts = _compare_sequences(ref, root)

Expand Down
148 changes: 103 additions & 45 deletions tests/test_ref_muts.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,54 +320,14 @@ def test_process_and_reroot_lineages_ref_not_in_muts_infer_root(
pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df)


def test_process_and_reroot_lineages_value_error_empty_root_seqs(
    sample_ref_fasta_file,
    sample_lineage_paths_file,
    tmp_path,
    mocker,
):
    """Expect a ValueError when no mutation sample has a matching FASTA record."""
    # The mutations table is non-empty, but it only names sampleD/sampleE,
    # neither of which is present in the sequences FASTA written below.
    muts_path = tmp_path / "empty_root_seqs_muts.tsv"
    muts_path.write_text("sampleD\tgene1:A1T\nsampleE\tgene1:C2G")

    fasta_path = tmp_path / "empty_root_seqs_sequences.fasta"
    fasta_path.write_text(">sampleA\nATGC\n>sampleB\nCGTA")

    # Not strictly required to observe the ValueError; patched for
    # consistency with the other tests in this module.
    console_stub = MagicMock(spec=Console)
    mocker.patch("barcodeforge.ref_muts.console", console_stub)

    call_kwargs = {
        "debug": False,
        "sample_muts_path": str(muts_path),
        "reference_fasta_path": sample_ref_fasta_file,
        "sequences_fasta_path": str(fasta_path),
        "input_lineage_paths_path": sample_lineage_paths_file,
        "output_additional_muts_path": str(
            tmp_path / "additional_muts_value_error.tsv"
        ),
        "output_rerooted_lineage_paths_path": str(
            tmp_path / "rerooted_lineages_value_error.tsv"
        ),
    }

    expected_message = "No valid root sequences could be generated. Check input FASTA and sample mutations."
    with pytest.raises(ValueError, match=expected_message):
        process_and_reroot_lineages(**call_kwargs)


def test_process_and_reroot_lineages_warning_missing_sample_in_fasta(
sample_ref_fasta_file, # ref_genome
sample_lineage_paths_file,
tmp_path,
mocker,
):
# sample_muts_path with sampleA (valid) and sampleMissing (not in FASTA)
# 1 out of 2 samples missing, so summary warning is printed
muts_content = "sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G"
muts_file = tmp_path / "missing_fasta_sample_muts.tsv"
muts_file.write_text(muts_content)
Expand Down Expand Up @@ -400,15 +360,22 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta(
assert output_additional_muts.exists()
assert output_rerooted_lineages.exists()

# Verify warning for missing sample
expected_warning_call_substr = (
"[yellow]Warning: Sample sampleMissing not found in FASTA file. Skipping."
)
# Verify summary warning is shown (any missing samples triggers warning)
expected_warning_call_substr = "1 out of 2 samples"
assert any(
expected_warning_call_substr in str(c_args)
for c_args in mocked_console.print.call_args_list
)

# Verify per-sample warning is NOT shown when debug=False
per_sample_warning_substr = (
"Sample sampleMissing not found in FASTA file. Skipping."
)
assert not any(
per_sample_warning_substr in str(c_args)
for c_args in mocked_console.print.call_args_list
)

# Verify inference based on sampleA
df_add_muts = pd.read_csv(output_additional_muts, sep="\t")
assert "position" in df_add_muts.columns or df_add_muts.empty
Expand All @@ -424,3 +391,94 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta(
)
else:
pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df)


def test_process_and_reroot_lineages_debug_shows_per_sample_warning(
    sample_ref_fasta_file,  # ref_genome
    sample_lineage_paths_file,
    tmp_path,
    mocker,
):
    """With debug=True, a per-sample warning is printed for a sample missing from the FASTA."""
    # sampleA has a FASTA record; sampleMissing does not.
    muts_path = tmp_path / "debug_missing_muts.tsv"
    muts_path.write_text("sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G")

    fasta_path = tmp_path / "debug_missing_seqs.fasta"
    fasta_path.write_text(">sampleA\nATAAAAAGAA\n>sampleOther\nCCCCCCCCCC")

    additional_muts_out = tmp_path / "additional_muts_debug.tsv"
    rerooted_out = tmp_path / "rerooted_lineages_debug.tsv"

    # Replace the module-level console so printed messages can be inspected.
    console_stub = MagicMock(spec=Console)
    mocker.patch("barcodeforge.ref_muts.console", console_stub)

    process_and_reroot_lineages(
        debug=True,
        sample_muts_path=str(muts_path),
        reference_fasta_path=sample_ref_fasta_file,
        sequences_fasta_path=str(fasta_path),
        input_lineage_paths_path=sample_lineage_paths_file,
        output_additional_muts_path=str(additional_muts_out),
        output_rerooted_lineage_paths_path=str(rerooted_out),
    )

    # The per-sample message must appear in at least one console.print call.
    needle = "Sample sampleMissing not found in FASTA file. Skipping."
    printed_calls = [str(call) for call in console_stub.print.call_args_list]
    assert any(needle in text for text in printed_calls)


def test_process_and_reroot_lineages_summary_warning_any_missing(
    sample_ref_fasta_file,
    sample_lineage_paths_file,
    tmp_path,
    mocker,
):
    """The summary warning is printed even when only one of five samples lacks a FASTA record."""
    # Five samples in the mutations table; only sampleMissing has no sequence.
    mutation_rows = (
        "sampleA\tgene1:A2T\n",
        "sampleB\tgene1:A6G\n",
        "sampleC\tgene1:A3T\n",
        "sampleD\tgene1:A4T\n",
        "sampleMissing\tgene1:C1G",
    )
    muts_path = tmp_path / "below_threshold_muts.tsv"
    muts_path.write_text("".join(mutation_rows))

    fasta_records = (
        ">sampleA\nATAAAAAGAA\n",
        ">sampleB\nAAAAAAGAAA\n",
        ">sampleC\nAAATAAAAAA\n",
        ">sampleD\nAAAATAAAAA\n",
    )
    fasta_path = tmp_path / "below_threshold_seqs.fasta"
    fasta_path.write_text("".join(fasta_records))

    additional_muts_out = tmp_path / "additional_muts_below.tsv"
    rerooted_out = tmp_path / "rerooted_lineages_below.tsv"

    # Replace the module-level console so printed messages can be inspected.
    console_stub = MagicMock(spec=Console)
    mocker.patch("barcodeforge.ref_muts.console", console_stub)

    process_and_reroot_lineages(
        debug=False,
        sample_muts_path=str(muts_path),
        reference_fasta_path=sample_ref_fasta_file,
        sequences_fasta_path=str(fasta_path),
        input_lineage_paths_path=sample_lineage_paths_file,
        output_additional_muts_path=str(additional_muts_out),
        output_rerooted_lineage_paths_path=str(rerooted_out),
    )

    # The aggregate warning text must appear in at least one console.print call.
    needle = "were not found in the FASTA file"
    printed_calls = [str(call) for call in console_stub.print.call_args_list]
    assert any(needle in text for text in printed_calls)
Loading