diff --git a/barcodeforge/ref_muts.py b/barcodeforge/ref_muts.py index 3f24aae..70f7807 100755 --- a/barcodeforge/ref_muts.py +++ b/barcodeforge/ref_muts.py @@ -133,6 +133,14 @@ def process_and_reroot_lineages( seqs = SeqIO.to_dict(SeqIO.parse(sequences_fasta_path, "fasta")) ref = SeqIO.read(reference_fasta_path, "fasta") + sample_ids = set(sample_muts_df["sample"]) + missing_count = len(sample_ids - set(seqs.keys())) + if missing_count: + total_count = len(sample_ids) + console.print( + f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]" + ) + # if reference in the sample mutations file, use that as the root if sample_muts_df[sample_muts_df["sample"] == ref.id].shape[0] > 0: console.print( @@ -170,18 +178,14 @@ def process_and_reroot_lineages( ) seq = seqs.get(sample_id, None) if seq is None: - # It's better to raise an error or handle this case explicitly - console.print( - f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]" - ) + # The summary warning printed above already reports how many samples are missing; only log the per-sample detail in debug mode. + if debug: + console.print( + f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]" + ) continue root_seqs.append(_construct_root_sequence(root_muts, seq)) - if not root_seqs: - raise ValueError( - "No valid root sequences could be generated. Check input FASTA and sample mutations." 
- ) - root = _derive_root_sequence(root_seqs) additional_muts = _compare_sequences(ref, root) diff --git a/tests/test_ref_muts.py b/tests/test_ref_muts.py index 43a97a5..377b35f 100644 --- a/tests/test_ref_muts.py +++ b/tests/test_ref_muts.py @@ -320,47 +320,6 @@ def test_process_and_reroot_lineages_ref_not_in_muts_infer_root( pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df) -def test_process_and_reroot_lineages_value_error_empty_root_seqs( - sample_ref_fasta_file, - sample_lineage_paths_file, - tmp_path, - mocker, -): - # sample_muts_df is not empty, but no corresponding sequences are found. - muts_content = ( - "sampleD\tgene1:A1T\nsampleE\tgene1:C2G" # These samples are not in seqs_file - ) - muts_file = tmp_path / "empty_root_seqs_muts.tsv" - muts_file.write_text(muts_content) - - # Empty sequences.fasta or sequences that don't match sampleD/sampleE - seqs_content = ">sampleA\nATGC\n>sampleB\nCGTA" - seqs_file = tmp_path / "empty_root_seqs_sequences.fasta" - seqs_file.write_text(seqs_content) - - output_additional_muts = tmp_path / "additional_muts_value_error.tsv" - output_rerooted_lineages = tmp_path / "rerooted_lineages_value_error.tsv" - - mocked_console = MagicMock( - spec=Console - ) # Though not strictly needed for ValueError, good for consistency - mocker.patch("barcodeforge.ref_muts.console", mocked_console) - - with pytest.raises( - ValueError, - match="No valid root sequences could be generated. 
Check input FASTA and sample mutations.", - ): - process_and_reroot_lineages( - debug=False, - sample_muts_path=str(muts_file), - reference_fasta_path=sample_ref_fasta_file, - sequences_fasta_path=str(seqs_file), - input_lineage_paths_path=sample_lineage_paths_file, - output_additional_muts_path=str(output_additional_muts), - output_rerooted_lineage_paths_path=str(output_rerooted_lineages), - ) - - def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( sample_ref_fasta_file, # ref_genome sample_lineage_paths_file, @@ -368,6 +327,7 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( mocker, ): # sample_muts_path with sampleA (valid) and sampleMissing (not in FASTA) + # 1 out of 2 samples missing, so summary warning is printed muts_content = "sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G" muts_file = tmp_path / "missing_fasta_sample_muts.tsv" muts_file.write_text(muts_content) @@ -400,15 +360,22 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( assert output_additional_muts.exists() assert output_rerooted_lineages.exists() - # Verify warning for missing sample - expected_warning_call_substr = ( - "[yellow]Warning: Sample sampleMissing not found in FASTA file. Skipping." - ) + # Verify summary warning is shown (any missing samples triggers warning) + expected_warning_call_substr = "1 out of 2 samples" assert any( expected_warning_call_substr in str(c_args) for c_args in mocked_console.print.call_args_list ) + # Verify per-sample warning is NOT shown when debug=False + per_sample_warning_substr = ( + "Sample sampleMissing not found in FASTA file. Skipping." 
+ ) + assert not any( + per_sample_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + ) + # Verify inference based on sampleA df_add_muts = pd.read_csv(output_additional_muts, sep="\t") assert "position" in df_add_muts.columns or df_add_muts.empty @@ -424,3 +391,94 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( ) else: pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df) + + +def test_process_and_reroot_lineages_debug_shows_per_sample_warning( + sample_ref_fasta_file, # ref_genome + sample_lineage_paths_file, + tmp_path, + mocker, +): + # When debug=True, the per-sample warning should be shown for each missing sample + muts_content = "sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G" + muts_file = tmp_path / "debug_missing_muts.tsv" + muts_file.write_text(muts_content) + + seqs_content = ">sampleA\nATAAAAAGAA\n>sampleOther\nCCCCCCCCCC" + seqs_file = tmp_path / "debug_missing_seqs.fasta" + seqs_file.write_text(seqs_content) + + output_additional_muts = tmp_path / "additional_muts_debug.tsv" + output_rerooted_lineages = tmp_path / "rerooted_lineages_debug.tsv" + + mocked_console = MagicMock(spec=Console) + mocker.patch("barcodeforge.ref_muts.console", mocked_console) + + process_and_reroot_lineages( + debug=True, + sample_muts_path=str(muts_file), + reference_fasta_path=sample_ref_fasta_file, + sequences_fasta_path=str(seqs_file), + input_lineage_paths_path=sample_lineage_paths_file, + output_additional_muts_path=str(output_additional_muts), + output_rerooted_lineage_paths_path=str(output_rerooted_lineages), + ) + + # Verify per-sample warning IS shown when debug=True + per_sample_warning_substr = ( + "Sample sampleMissing not found in FASTA file. Skipping." 
+ ) + assert any( + per_sample_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + ) + + +def test_process_and_reroot_lineages_summary_warning_any_missing( + sample_ref_fasta_file, + sample_lineage_paths_file, + tmp_path, + mocker, +): + # 1 out of 5 samples missing — warning is always shown when any samples are missing + muts_content = ( + "sampleA\tgene1:A2T\n" + "sampleB\tgene1:A6G\n" + "sampleC\tgene1:A3T\n" + "sampleD\tgene1:A4T\n" + "sampleMissing\tgene1:C1G" + ) + muts_file = tmp_path / "below_threshold_muts.tsv" + muts_file.write_text(muts_content) + + seqs_content = ( + ">sampleA\nATAAAAAGAA\n" + ">sampleB\nAAAAAAGAAA\n" + ">sampleC\nAAATAAAAAA\n" + ">sampleD\nAAAATAAAAA\n" + ) + seqs_file = tmp_path / "below_threshold_seqs.fasta" + seqs_file.write_text(seqs_content) + + output_additional_muts = tmp_path / "additional_muts_below.tsv" + output_rerooted_lineages = tmp_path / "rerooted_lineages_below.tsv" + + mocked_console = MagicMock(spec=Console) + mocker.patch("barcodeforge.ref_muts.console", mocked_console) + + process_and_reroot_lineages( + debug=False, + sample_muts_path=str(muts_file), + reference_fasta_path=sample_ref_fasta_file, + sequences_fasta_path=str(seqs_file), + input_lineage_paths_path=sample_lineage_paths_file, + output_additional_muts_path=str(output_additional_muts), + output_rerooted_lineage_paths_path=str(output_rerooted_lineages), + ) + + # Verify summary warning IS shown even when only 1 out of 5 samples is missing + summary_warning_substr = "were not found in the FASTA file" + assert any( + summary_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + )