Skip to content

Commit 635c217

Browse files
Refactoring and RT fix
1 parent eda8878 commit 635c217

5 files changed

Lines changed: 126 additions & 124 deletions

File tree

feature_generators/features_retention_time.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
def add_retention_time_features(
99
df_psms: pl.DataFrame,
10-
predictions_deeplc: pl.DataFrame,
10+
predictions_deeplc: pl.DataFrame = None,
1111
filter_rel_rt_error: float = 0.2,
1212
rt_prediction_error_abs: bool = True,
1313
rt_prediction_error_abs_relative: bool = True,
@@ -27,8 +27,10 @@ def add_retention_time_features(
2727
when falling back to absolute error.
2828
2929
Args:
30-
df_psms: PSM DataFrame with 'peptide' and 'rt' columns
31-
predictions_deeplc: DataFrame with 'peptide' and 'rt_predictions' columns
30+
df_psms: PSM DataFrame with 'peptide' and 'rt' columns.
31+
predictions_deeplc: DataFrame with 'peptide' and 'rt_predictions' columns.
32+
If None, df_psms must already contain 'rt_predictions' (e.g. from
33+
predict_deeplc_pl in Stage 1).
3234
filter_rel_rt_error: Maximum relative RT error threshold for filtering (default: 0.2)
3335
rt_prediction_error_abs: Whether to calculate absolute RT error (default: True)
3436
rt_prediction_error_abs_relative: Whether to calculate relative RT error (default: True)
@@ -38,7 +40,8 @@ def add_retention_time_features(
3840
Added columns: rt_predictions, rt_prediction_error_abs, rt_prediction_error_abs_relative
3941
"""
4042

41-
df_psms = df_psms.join(predictions_deeplc, on=["peptide"], how="left")
43+
if predictions_deeplc is not None:
44+
df_psms = df_psms.join(predictions_deeplc, on=["peptide"], how="left")
4245
# max_rt is the latest observed retention time in the experiment. Dividing
4346
# by max_rt converts absolute RT error into a fraction of the experiment's
4447
# total RT range, so the filter_rel_rt_error threshold (e.g. 0.2 = 20%)

mumdia.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1290,10 +1290,7 @@ def calculate_features(
12901290
)
12911291

12921292
# Step 2: Compute RT error features and filter out poor predictions.
1293-
# BUG: add_retention_time_features() requires (df_psms, predictions_deeplc, ...)
1294-
# but predictions_deeplc is not passed here. This call is missing the 2nd positional arg.
1295-
# It works only if df_psms already has 'rt_predictions' column (joined in Step 1)
1296-
# AND the function signature is updated to make predictions_deeplc optional.
1293+
# predictions_deeplc=None because rt_predictions is already in df_psms from Step 1.
12971294
log_info("Obtaining features retention time...")
12981295
df_psms = add_retention_time_features(df_psms, filter_rel_rt_error=0.15)
12991296

tests/test_config_compatibility.py

Lines changed: 63 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,69 @@
1-
#!/usr/bin/env python3
2-
"""
3-
Test script to verify that both old and new config formats work with the new system.
1+
"""Tests for configuration backwards compatibility between old nested and new flat formats."""
42

5-
This demonstrates that the new system is fully backwards compatible.
6-
"""
3+
import os
4+
5+
import pytest
76

87
from config import load_config_from_json
98

10-
def test_config_format(config_path, format_name):
11-
"""Test loading and using a specific config format."""
12-
print(f"\n{'='*60}")
13-
print(f"🧪 Testing {format_name} Config Format")
14-
print(f"📁 File: {config_path}")
15-
print(f"{'='*60}")
16-
17-
try:
18-
# Load config
19-
config = load_config_from_json(config_path)
20-
print(f"✅ Config loaded successfully!")
21-
22-
# Show basic parameters
23-
print(f"\n📋 Basic Parameters:")
24-
print(f" mzML file: {config.mzml_file}")
25-
print(f" FASTA file: {config.fasta_file}")
26-
print(f" Result dir: {config.result_dir}")
27-
28-
# Show search parameter differences
29-
print(f"\n🔬 Search Parameters:")
30-
initial_config = config.get_initial_search_config()
31-
full_config = config.get_full_search_config()
32-
33-
print(f" Parameter Initial Search Full Search")
34-
print(f" cleave_at {initial_config['database']['enzyme']['cleave_at']:15} {full_config['database']['enzyme']['cleave_at']}")
35-
print(f" deisotope {str(initial_config['deisotope']):15} {str(full_config['deisotope'])}")
36-
print(f" report_psms {str(initial_config['report_psms']):15} {str(full_config['report_psms'])}")
37-
print(f" max_variable_mods {str(initial_config['database']['max_variable_mods']):15} {str(full_config['database']['max_variable_mods'])}")
38-
39-
# Show MuMDIA settings
40-
mumdia_config = config.get_mumdia_config()
41-
print(f"\n📊 MuMDIA Settings:")
42-
print(f" FDR initial search: {config.fdr_init_search}")
43-
print(f" Read initial pickle: {mumdia_config['read_initial_search_pickle']}")
44-
print(f" Write initial pickle: {mumdia_config['write_initial_search_pickle']}")
45-
46-
print(f"\n{format_name} format works perfectly!")
47-
return True
48-
49-
except Exception as e:
50-
print(f"❌ Error testing {format_name} format: {e}")
51-
return False
529

53-
def main():
54-
print("🔧 MuMDIA Config Backwards Compatibility Test")
55-
print("Testing both old (nested) and new (flat) config formats...")
56-
57-
# Test old nested format
58-
old_works = test_config_format("configs/config.json", "Legacy/Old Nested")
59-
60-
# Test new flat format
61-
new_works = test_config_format("configs/config_simple.json", "New Simplified Flat")
62-
63-
print(f"\n{'='*60}")
64-
print("🎯 Test Summary")
65-
print(f"{'='*60}")
66-
print(f"Legacy config (nested): {'✅ PASS' if old_works else '❌ FAIL'}")
67-
print(f"New config (flat): {'✅ PASS' if new_works else '❌ FAIL'}")
68-
print(f"Backwards compatibility: {'✅ MAINTAINED' if old_works and new_works else '❌ BROKEN'}")
69-
70-
if old_works and new_works:
71-
print(f"\n🎉 SUCCESS: Both config formats work!")
72-
print(f" • Users can keep using their existing config.json files")
73-
print(f" • Users can also switch to the new simplified format")
74-
print(f" • The new system automatically detects and converts formats")
75-
print(f"\nTo run MuMDIA:")
76-
print(f" python run.py configs/config.json # Old format")
77-
print(f" python run.py configs/config_simple.json # New format")
78-
else:
79-
print(f"\n❌ FAILURE: Config compatibility is broken!")
10+
@pytest.mark.unit
11+
class TestConfigCompatibility:
12+
"""Verify that both old and new config formats load correctly."""
13+
14+
@pytest.fixture
15+
def legacy_config_path(self):
16+
path = "configs/config.json"
17+
if not os.path.exists(path):
18+
pytest.skip("configs/config.json not found")
19+
return path
20+
21+
@pytest.fixture
22+
def flat_config_path(self):
23+
path = "configs/config_simple.json"
24+
if not os.path.exists(path):
25+
pytest.skip("configs/config_simple.json not found")
26+
return path
27+
28+
def _check_config(self, config):
29+
"""Common assertions for any loaded config."""
30+
assert hasattr(config, "mzml_file")
31+
assert hasattr(config, "fasta_file")
32+
assert hasattr(config, "result_dir")
33+
34+
initial = config.get_initial_search_config()
35+
full = config.get_full_search_config()
36+
37+
assert "database" in initial
38+
assert "database" in full
39+
assert "enzyme" in initial["database"]
40+
assert "report_psms" in initial
41+
42+
mumdia = config.get_mumdia_config()
43+
assert "read_initial_search_pickle" in mumdia
44+
assert "write_deeplc_pickle" in mumdia
45+
46+
def test_legacy_nested_format(self, legacy_config_path):
47+
"""Load the old nested (sage_basic / sage / mumdia) format."""
48+
config = load_config_from_json(legacy_config_path)
49+
self._check_config(config)
50+
51+
def test_flat_format(self, flat_config_path):
52+
"""Load the new flat format with _initial_search / _full_search overrides."""
53+
config = load_config_from_json(flat_config_path)
54+
self._check_config(config)
55+
56+
def test_override_mechanism(self, flat_config_path):
57+
"""Verify that search-stage overrides produce different configs."""
58+
config = load_config_from_json(flat_config_path)
59+
initial = config.get_initial_search_config()
60+
full = config.get_full_search_config()
8061

81-
if __name__ == "__main__":
82-
main()
62+
# At least one parameter should differ between stages
63+
differs = (
64+
initial["report_psms"] != full["report_psms"]
65+
or initial["deisotope"] != full["deisotope"]
66+
or initial["database"]["enzyme"]["cleave_at"]
67+
!= full["database"]["enzyme"]["cleave_at"]
68+
)
69+
assert differs, "Initial and full search configs should have at least one difference"

tests/test_diann_features.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ def load_test_data():
296296
return None
297297

298298

299-
def test_single_precursor(data, precursor_id="GYINSLGALTGGQALQQAK/2"):
299+
def check_single_precursor(data, precursor_id="GYINSLGALTGGQALQQAK/2"):
300300
"""Test feature generation for a single precursor."""
301301

302302
logger.info(f"Testing feature generation for precursor: {precursor_id}")
@@ -379,7 +379,7 @@ def print_feature_summary(features):
379379
print("\n" + "=" * 50)
380380

381381

382-
def test_multiple_precursors(data, max_precursors=5):
382+
def check_multiple_precursors(data, max_precursors=5):
383383
"""Test feature generation for multiple precursors."""
384384

385385
logger.info(f"Testing feature generation for up to {max_precursors} precursors")
@@ -392,7 +392,7 @@ def test_multiple_precursors(data, max_precursors=5):
392392
for precursor_id in unique_precursors:
393393
try:
394394
logger.info(f"Processing {precursor_id}")
395-
features = test_single_precursor(data, precursor_id)
395+
features = check_single_precursor(data, precursor_id)
396396
if features:
397397
results[precursor_id] = features
398398
else:
@@ -436,7 +436,7 @@ def benchmark_performance(data, num_tests=10):
436436
start_time = time.time()
437437

438438
try:
439-
features = test_single_precursor(data, precursor_id)
439+
features = check_single_precursor(data, precursor_id)
440440
if features:
441441
elapsed = time.time() - start_time
442442
times.append(elapsed)
@@ -1023,7 +1023,7 @@ def main_sequential():
10231023

10241024
# Test single precursor
10251025
print("\n1. Testing single precursor...")
1026-
features = test_single_precursor(data)
1026+
features = check_single_precursor(data)
10271027

10281028
if features:
10291029
print_feature_summary(features)

tests/test_workflow_integration.py

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -41,53 +41,68 @@ class TestWorkflowIntegration:
4141

4242
@pytest.mark.skipif(not RUN_MODULE_AVAILABLE, reason="run module not available")
4343
@pytest.mark.integration
44-
def test_argument_parsing_integration(self):
45-
"""Test command line argument parsing and validation."""
46-
with patch(
47-
"sys.argv",
48-
["run.py", "--mzml_file", "test.mzML", "--result_dir", "test_results"],
49-
):
50-
parser, args = run.parse_arguments()
51-
52-
assert hasattr(args, "mzml_file")
53-
assert hasattr(args, "result_dir")
54-
assert args.mzml_file == "test.mzML"
55-
assert args.result_dir == "test_results"
44+
def test_config_loading_integration(self):
45+
"""Test loading config from JSON and generating search configs."""
46+
from config import MuMDIAConfig, load_config_from_json
5647

57-
@pytest.mark.skipif(not RUN_MODULE_AVAILABLE, reason="run module not available")
58-
@pytest.mark.integration
59-
def test_config_modification_workflow(self):
60-
"""Test configuration modification and validation workflow."""
61-
# Create a temporary config file
48+
# Create a temporary config file in the new flat format
6249
config_data = {
63-
"mumdia": {"fdr_init_search": 0.01, "min_occurrences": 1},
64-
"sage": {"database": {"fasta": "test.fasta"}},
50+
"mzml_file": "test.mzML",
51+
"fasta_file": "test.fasta",
52+
"result_dir": "test_results",
53+
"fdr_init_search": 0.01,
6554
}
6655

67-
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
56+
with tempfile.NamedTemporaryFile(
57+
mode="w", suffix=".json", delete=False
58+
) as f:
6859
json.dump(config_data, f)
6960
config_path = f.name
7061

7162
try:
72-
with tempfile.TemporaryDirectory() as temp_dir:
73-
# Mock parser and args with proper defaults
74-
parser = Mock()
75-
parser._actions = []
63+
config = load_config_from_json(config_path)
64+
assert config.mzml_file == "test.mzML"
65+
assert config.result_dir == "test_results"
66+
assert config.fdr_init_search == 0.01
67+
68+
initial = config.get_initial_search_config()
69+
full = config.get_full_search_config()
70+
assert "database" in initial
71+
assert "database" in full
72+
finally:
73+
os.unlink(config_path)
7674

77-
# Create a proper args namespace without sentinel objects
78-
args = argparse.Namespace(
79-
fdr_init_search=0.05, min_occurrences=1, database=None, fasta=None
80-
)
75+
@pytest.mark.skipif(not RUN_MODULE_AVAILABLE, reason="run module not available")
76+
@pytest.mark.integration
77+
def test_legacy_config_conversion(self):
78+
"""Test that legacy nested configs are auto-converted."""
79+
from config import load_config_from_json
8180

82-
# Test config modification
83-
new_config_path = run.modify_config(config_path, temp_dir, parser, args)
81+
config_data = {
82+
"sage_basic": {
83+
"database": {"fasta": "test.fasta", "enzyme": {"cleave_at": "KR"}},
84+
"deisotope": False,
85+
"report_psms": 5,
86+
},
87+
"sage": {
88+
"database": {"fasta": "test.fasta", "enzyme": {"cleave_at": "$"}},
89+
"deisotope": True,
90+
"report_psms": 12,
91+
},
92+
"mumdia": {"fdr_init_search": 0.01},
93+
}
8494

85-
# Verify new config was created
86-
assert os.path.exists(new_config_path)
95+
with tempfile.NamedTemporaryFile(
96+
mode="w", suffix=".json", delete=False
97+
) as f:
98+
json.dump(config_data, f)
99+
config_path = f.name
87100

88-
with open(new_config_path) as f:
89-
updated_config = json.load(f)
90-
assert "mumdia" in updated_config
101+
try:
102+
config = load_config_from_json(config_path)
103+
mumdia_config = config.get_mumdia_config()
104+
assert "fdr_init_search" in mumdia_config
105+
assert mumdia_config["fdr_init_search"] == 0.01
91106
finally:
92107
os.unlink(config_path)
93108

0 commit comments

Comments
 (0)