settylab · settylab-dotto-bot · May 29, 2026 · May 29, 2026
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+## [0.8.0] - 2026-05-28
+
+### Behavior changes
+
+These three changes correct discrepancies between the implementation and the manuscript that describes Kompot's statistics. Two are numerical scale shifts that preserve relative rankings; the third is a default-value harmonization. Re-tune any absolute thresholds calibrated against 0.7.0.
+
+ - **Mahalanobis denominator now sums covariances**: the gene-wise Mahalanobis distance used by `DifferentialExpression.predict(compute_mahalanobis=True)` now computes the posterior combined covariance as `Σ_a + Σ_b` (the variance of the difference of two independent posterior estimators) instead of `(Σ_a + Σ_b) / 2`. Matches the manuscript definition `D(a,b) = sqrt((μ_a − μ_b)^T (Σ_a + Σ_b)^(-1) (μ_a − μ_b))`. **Effect**: absolute Mahalanobis distances in the GP-only regime contract by a factor of `√2` (and `D²` by 2). Relative rankings of genes are unchanged because the same scale factor applies everywhere, and FDR thresholds re-calibrate against the null. The sample-variance branch was already correctly summed.
+ - **Differential-abundance posterior tail probability is now one-sided**: `DifferentialAbundance.predict()` returns `PTP = Φ(−|z|)` (one-sided), matching the manuscript definition `PTP(x_i) = Φ(−|Δ(x_i)|/√(σ_a² + σ_b²))`. Previous releases returned `2·Φ(−|z|)` (two-sided). **Effect**: numeric PTP values are halved relative to 0.7.0; equivalently, `neg_log10_fold_change_ptp` increases by `log10(2) ≈ 0.301`. The threshold `PTP < 1e-3` previously corresponded to `|z| ≥ 3.29` and now corresponds to `|z| ≥ 3.09`. Re-tune any hard-coded `ptp_threshold` chosen against 0.7.0 if you want to preserve the old call-rate.
+ - **`use_empirical_variance` default is now `False` everywhere**: harmonized across `kompot.de()` (already False), `DifferentialExpression.__init__`, `ExpressionModel.__init__`, `kompot.smooth_expression()`, the deprecated `compute_differential_expression()` and `compute_smoothed_expression()` wrappers, and the CLI `smooth_config_template.yaml`. Previously all six of these entry points other than `kompot.de()` defaulted to `True`, inconsistent with both the recommended `kompot.de()` path and the manuscript's "empirical variance is disabled by default" statement. Code that relies on empirical variance must now pass `use_empirical_variance=True` explicitly.
+
 ### New features
 
  - **`--dry-run` flag for `kompot de` CLI**: estimates memory, disk, and output field requirements without running the analysis. Outputs machine-parseable JSON to stdout and a human-readable report to stderr. Exit code reflects feasibility.

@@ -751,7 +751,7 @@ def compute_differential_expression(
     return_full_results: bool = False,
     store_posterior_covariance: bool = False,
     allow_single_condition_variance: bool = False,
-    use_empirical_variance: bool = True,
+    use_empirical_variance: bool = False,
     progress: bool = True,
     null_genes="auto",
     null_seed=42,

@@ -109,7 +109,7 @@ def smooth_expression(
     ls = gp.ls if gp is not None else None
     ls_factor = gp.ls_factor if gp is not None else 10.0
     n_landmarks = gp.n_landmarks if gp is not None else 5000
-    use_empirical_variance = gp.use_empirical_variance if gp is not None else True
+    use_empirical_variance = gp.use_empirical_variance if gp is not None else False
     eps = gp.eps if gp is not None else 1e-8
     random_state = gp.random_state if gp is not None else None
     batch_size = gp.batch_size if gp is not None else 500
@@ -393,7 +393,7 @@ def compute_smoothed_expression(
     sigma: float = 1.0,
     ls: Optional[float] = None,
     ls_factor: float = 10.0,
-    use_empirical_variance: bool = True,
+    use_empirical_variance: bool = False,
     eps: float = 1e-8,
     random_state: Optional[int] = None,
     batch_size: int = 500,

@@ -26,7 +26,7 @@ n_landmarks: 5000              # Number of landmarks for Nystrom approximation
 sample_col: null               # Column in adata.obs with sample IDs
 
 # Empirical variance (heteroscedastic noise):
-use_empirical_variance: true   # Estimate per-gene noise from GP residuals
+use_empirical_variance: false  # Estimate per-gene noise from GP residuals
 
 # GP kernel parameters:
 sigma: 1.0                     # Noise level for function estimator

@@ -603,11 +603,12 @@ def compute_sample_variance2(X_batch):
         sd = np.sqrt(log_fold_change_uncertainty + self.eps)
         log_fold_change_zscore = log_fold_change / sd
 
-        # Compute PTP (Posterior Tail Probability) in natural log (base e)
+        # Compute PTP (Posterior Tail Probability) in natural log (base e).
+        # One-sided per manuscript: PTP = Φ(−|z|) = min(Φ(z), Φ(−z)) for real z.
         ln_ptp = np.minimum(
             normal.logcdf(log_fold_change_zscore),
             normal.logcdf(-log_fold_change_zscore),
-        ) + np.log(2)
+        )
 
         # Convert from natural log to negative log10 (for better volcano plot visualization)
         # ln_ptp is a log of a small value (typically < 1), so it's negative

@@ -42,7 +42,7 @@ def __init__(
         self,
         n_landmarks: Optional[int] = None,
         use_sample_variance: Optional[bool] = None,
-        use_empirical_variance: bool = True,
+        use_empirical_variance: bool = False,
         eps: float = 1e-8,  # Increased default epsilon for better numerical stability
         jit_compile: bool = False,
         function_predictor1: Optional[Any] = None,
@@ -625,8 +625,10 @@ def compute_mahalanobis_distances(
             # Points for sample variance computation
             variance_points = X
 
-        # Average the covariance matrices
-        combined_cov = (cov1 + cov2) / 2
+        # Sum the covariance matrices: Σ_a + Σ_b is the variance of the
+        # difference of independent posterior estimators, matching the
+        # Mahalanobis denominator defined in the manuscript.
+        combined_cov = cov1 + cov2
         del cov1, cov2
 
         # For sample variance, use diag=False to get full covariance matrices

@@ -112,6 +112,7 @@ class ExpressionModel:
         Number of landmarks for Nystrom approximation.
     use_empirical_variance : bool
         Whether to estimate per-gene empirical variance from GP residuals.
+        By default False.
     eps : float
         Small constant for numerical stability.
     random_state : int, optional
@@ -135,7 +136,7 @@ class ExpressionModel:
     def __init__(
         self,
         n_landmarks: Optional[int] = None,
-        use_empirical_variance: bool = True,
+        use_empirical_variance: bool = False,
         eps: float = 1e-8,
         random_state: Optional[int] = None,
         batch_size: int = 500,

@@ -898,10 +898,10 @@ def estimate_differential_expression_resources(
         shape=cov_matrix_shape,
     )
 
-    # Combined covariance matrix (averaged)
+    # Combined covariance matrix (sum: Σ_a + Σ_b)
     plan.add_requirement(
         "Combined covariance matrix",
-        cov_size,  # (cov1 + cov2) / 2
+        cov_size,  # cov1 + cov2
         "memory",
         shape=cov_matrix_shape,
     )

@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.7.0"
+__version__ = "0.8.0"
@@ -49,7 +49,7 @@ ignore = ["E203", "W503"]
 
 [project]
 name = "kompot"
-version = "0.7.0"
+version = "0.8.0"
 description = "Differential abundance and gene expression analysis using Mahalanobis distance with JAX backend"
 readme = "README.md"
 authors = [