# Fix Summary: feglm Gaussian Standard Errors with Fixed Effects

## Problem
The test `test_glm_with_fe_vs_fixest` was failing because `pf.feglm()` with `family="gaussian"` and fixed effects produced **incorrect standard errors** (~9% too large initially).

## Root Causes Identified

### 1. Incorrect Residual Computation
**File**: `pyfixest/estimation/feglm_.py:230-252`

**Issue**: Residuals were computed as `Y_original - predictions_demeaned`, mixing the original and demeaned spaces.

**Fix**: For Gaussian GLM with fixed effects, demean Y and compute residuals as `Y_demeaned - X_demeaned @ beta`.
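
A minimal standalone sketch of the issue (not pyfixest code; a single fixed effect demeaned via group means): residuals formed as `Y_demeaned - X_demeaned @ beta` reproduce the residuals of a dummy-variable OLS, while mixing the original Y with demeaned predictions leaves the group means of Y in the "residuals" and inflates every variance built from them.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 300
df = pd.DataFrame({
    "g": rng.integers(0, 10, n),   # fixed-effect groups
    "x": rng.normal(size=n),
})
df["y"] = 0.5 * df["x"] + df["g"].astype(float) + rng.normal(size=n)

# Within transformation: demean y and x inside each group
y_dd = df["y"] - df.groupby("g")["y"].transform("mean")
x_dd = df["x"] - df.groupby("g")["x"].transform("mean")

beta = float((x_dd * y_dd).sum() / (x_dd * x_dd).sum())  # within (FWL) estimate

u_correct = y_dd - x_dd * beta   # residuals in the demeaned space
u_mixed = df["y"] - x_dd * beta  # original Y minus demeaned prediction: wrong

print((u_correct**2).sum())  # equals the dummy-variable OLS residual sum of squares
print((u_mixed**2).sum())    # much larger: the group means of Y are still in there
```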

### 2. Incorrect Sigma² Denominator
**File**: `pyfixest/estimation/fegaussian_.py:104`

**Issue**: Used `N` as denominator instead of `df_t` (degrees of freedom).

**Fix**: Changed to use `df_t` to match `feols` behavior.
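
A small illustration with made-up numbers (`rss`, `N`, and `df_t` below are hypothetical; `df_t` stands for the residual degrees of freedom, roughly N minus all estimated parameters including absorbed fixed-effect levels, with the exact bookkeeping left to pyfixest):

```python
import numpy as np

rss = 42.0          # sum of squared residuals (hypothetical)
N = 500             # observations
df_t = N - 1 - 25   # e.g. one covariate plus 25 absorbed fixed-effect levels

sigma2_old = rss / N      # previous behaviour
sigma2_new = rss / df_t   # matches feols

# IID standard errors scale with sqrt(sigma2); taken on its own, dividing by N
# understates them by this factor:
print(np.sqrt(sigma2_old / sigma2_new))
```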

### 3. Incorrect Scores Computation
**File**: `pyfixest/estimation/feglm_.py:247-249`

**Issue**: Scores used the original X with demeaned residuals: `scores = u_hat_demeaned * X_original`.

**Fix**: For Gaussian GLM with fixed effects, use the demeaned X: `scores = u_hat_demeaned * X_demeaned`.
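
A generic sandwich-estimator sketch (HC0-style, no small-sample correction; not pyfixest's exact code) showing where the demeaned X has to enter:

```python
import numpy as np

def hetero_se(X_bread: np.ndarray, scores: np.ndarray) -> np.ndarray:
    """Heteroskedasticity-robust SEs from a bread design matrix and score rows."""
    bread = np.linalg.inv(X_bread.T @ X_bread)
    meat = scores.T @ scores  # sum over observations of s_i s_i'
    return np.sqrt(np.diag(bread @ meat @ bread))

# With fixed effects absorbed, both pieces live in the demeaned space:
#   correct: scores = u_hat_demeaned[:, None] * X_demeaned
#   buggy:   scores = u_hat_demeaned[:, None] * X_original  (mixes spaces in the meat)
```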

## Changes Made

### 1. pyfixest/estimation/feglm_.py (lines 230-263)

```python
if self._method == "feglm-gaussian" and self._fe is not None:
    # For Gaussian with identity link and fixed effects,
    # residuals must be computed in the demeaned space to match feols.
    # Demean Y and compute residuals as Y_demeaned - X_demeaned @ beta
    y_demeaned, _ = self.residualize(
        v=self._Y,
        X=np.zeros((self._N, 0)),  # Just demean Y, no X needed
        flist=self._fe,
        weights=W_tilde.flatten(),
        tol=self._fixef_tol,
        maxiter=self._fixef_maxiter,
    )
    # Residuals in demeaned space
    self._u_hat_response = (y_demeaned.flatten() - X_dotdot @ beta).flatten()
    self._u_hat_working = self._u_hat_response

    # For sandwich variance, scores must also use demeaned X
    self._scores_response = self._u_hat_response[:, None] * X_dotdot
    self._scores_working = self._u_hat_working[:, None] * X_dotdot
    self._scores = self._scores_response  # Use response scores for Gaussian
elif self._method == "feglm-gaussian":
    # Gaussian without fixed effects
    self._u_hat_response = (self._Y.flatten() - self._get_mu(theta=eta)).flatten()
    self._u_hat_working = self._u_hat_response
    self._scores_response = self._u_hat_response[:, None] * self._X
    self._scores_working = self._u_hat_working[:, None] * self._X
    self._scores = self._get_score(y=self._Y.flatten(), X=self._X, mu=mu, eta=eta)
else:
    # For other GLM families
    self._u_hat_response = (self._Y.flatten() - self._get_mu(theta=eta)).flatten()
    self._u_hat_working = (v_dotdot / W_tilde).flatten()
    self._scores_response = self._u_hat_response[:, None] * self._X
    self._scores_working = self._u_hat_working[:, None] * self._X
    self._scores = self._get_score(y=self._Y.flatten(), X=self._X, mu=mu, eta=eta)
```
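
A quick standalone check (standard GLM notation, not pyfixest internals) of why the Gaussian branch can set the working residuals and scores equal to the response ones: with the identity link, `mu = eta` and `d(eta)/d(mu) = 1`, so the working residual `(y - mu) * d(eta)/d(mu)` collapses to the response residual `y - mu`.

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0])
eta = np.array([0.8, 2.1, 2.9])  # linear predictor
mu = eta                         # identity link
deta_dmu = np.ones_like(mu)      # d(eta)/d(mu) = 1

u_response = y - mu
u_working = (y - mu) * deta_dmu
assert np.allclose(u_working, u_response)
```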

### 2. pyfixest/estimation/fegaussian_.py (lines 100-107)

```python
def _vcov_iid(self):
    _u_hat = self._u_hat
    _bread = self._bread
    # Use df_t (degrees of freedom) for denominator, matching feols behavior
    sigma2 = np.sum(_u_hat.flatten() ** 2) / self._df_t
    _vcov = _bread * sigma2

    return _vcov
```

## Test Results

**Before Fix**:
- Python SE = 0.2596
- R SE = 0.2379
- Difference ≈ 9%
- Test status: **FAILED**

**After Fix**:
- Python SE = 0.2379 (IID), 0.2440 (hetero)
- R SE = 0.2379 (IID), 0.2440 (hetero)
- Difference < 0.0001%
- Test status: **18/18 PASSED** ✓
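
A hedged reproduction sketch (illustrative formula and pyfixest's bundled example data, not the exact test fixture): after the fix, Gaussian `feglm` with a fixed effect should report the same standard errors as `feols` on the same formula.

```python
import pyfixest as pf

data = pf.get_data()

fit_glm = pf.feglm(fml="Y ~ X1 | f1", data=data, family="gaussian")
fit_ols = pf.feols(fml="Y ~ X1 | f1", data=data)

# The "Std. Error" columns should agree to numerical precision
print(fit_glm.tidy()[["Estimate", "Std. Error"]])
print(fit_ols.tidy()[["Estimate", "Std. Error"]])
```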

## Impact

- ✅ IID standard errors now match R fixest exactly
- ✅ Heteroskedastic standard errors now match R fixest exactly
- ✅ Clustered standard errors now match R fixest exactly (see the sketch after this list)
- ✅ All tested formulas pass (single FE, two-way FE, multiple covariates)
- ✅ Both `dropna=True` and `dropna=False` cases pass
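
For the clustered case, a generic CRV0-style sketch (not pyfixest's implementation, which additionally applies feols-style small-sample corrections): the same demeaned scores are summed within clusters before forming the meat.

```python
import numpy as np

def cluster_meat(scores: np.ndarray, cluster_ids: np.ndarray) -> np.ndarray:
    """Sum score rows within each cluster, then accumulate outer products."""
    k = scores.shape[1]
    meat = np.zeros((k, k))
    for g in np.unique(cluster_ids):
        s_g = scores[cluster_ids == g].sum(axis=0)
        meat += np.outer(s_g, s_g)
    return meat

# vcov_cluster = bread @ cluster_meat(scores, cluster_ids) @ bread
# where scores = u_hat_demeaned[:, None] * X_demeaned, as fixed above.
```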

## Notes

- The fix only applies to the Gaussian family with fixed effects
- Other GLM families (logit, probit) with fixed effects are not yet supported (they raise `NotImplementedError`)
- Gaussian without fixed effects is unchanged (that path was already correct)