Fix #848: Pass estimation_sample_size parameter to individual trees in UpliftRandomForestClassifier (#850)

mohsinm-dev · web-flow · commit 6f4ec718c0e8 · 2025-09-25T20:14:47.000-07:00
diff --git a/causalml/inference/tree/uplift.pyx b/causalml/inference/tree/uplift.pyx
@@ -243,7 +243,7 @@ def group_uniqueCounts_to_arr(np.ndarray[TR_TYPE_t, ndim=1] treatment_idx,
         tv = treatment_idx[i]
         # assume treatment index is in range
         out_arr[2*tv] += 1
-        # assume y should be either 0 or 1, so this is summing 
+        # assume y should be either 0 or 1, so this is summing
         out_arr[2*tv + 1] += y[i]
     # adjust the entry at index 2*i to be N(Y = 0, T = i) = N(T = i) - N(Y = 1, T = i)
     for i in range(n_class):
@@ -322,7 +322,7 @@ def group_counts_by_divide(
             tv = treatment_idx[i]
             # assume treatment index is in range
             out_arr[2*tv] += 1
-            # assume y should be either 0 or 1, so this is summing 
+            # assume y should be either 0 or 1, so this is summing
             out_arr[2*tv + 1] += y[i]
     # adjust the entry at index 2*i to be N(Y = 0, T = i) = N(T = i) - N(Y = 1, T = i)
     for i in range(n_class):
@@ -360,9 +360,9 @@ class UpliftTreeClassifier:
     n_reg: int, optional (default=100)
         The regularization parameter defined in Rzepakowski et al. 2012, the weight (in terms of sample size) of the
         parent node influence on the child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
-    
+
     early_stopping_eval_diff_scale: float, optional (default=1)
-        If train and valid uplift score diff bigger than 
+        If train and valid uplift score diff bigger than
         min(train_uplift_score,valid_uplift_score)/early_stopping_eval_diff_scale, stop.
 
     control_name: string
@@ -404,7 +404,7 @@ class UpliftTreeClassifier:
             self.arr_eval_func = self.arr_evaluate_ED
         elif evaluationFunction == 'Chi':
             self.evaluationFunction = self.evaluate_Chi
-            self.arr_eval_func = self.arr_evaluate_Chi     
+            self.arr_eval_func = self.arr_evaluate_Chi
         elif evaluationFunction == 'DDP':
             self.evaluationFunction = self.evaluate_DDP
             self.arr_eval_func = self.arr_evaluate_DDP
@@ -465,7 +465,7 @@ class UpliftTreeClassifier:
             y_val = (y_val > 0).astype(Y_TYPE) # make sure it is 0 or 1, and is int8
             treatment_val = np.asarray(treatment_val)
             assert len(y_val) == len(treatment_val), 'Data length must be equal for X_val, treatment_val, and y_val.'
-        
+
         # Get treatment group keys. self.classes_[0] is reserved for the control group.
         treatment_groups = sorted([x for x in list(set(treatment)) if x != self.control_name])
         self.classes_ = [self.control_name]
@@ -1336,7 +1336,7 @@ class UpliftTreeClassifier:
                          np.ndarray[N_TYPE_t, ndim=1] right_node_summary_n):
         '''
         Calculate likelihood ratio test statistic as split evaluation criterion for a given node
-        
+
         NOTE: n_class should be 2.
 
         Args
@@ -1365,7 +1365,7 @@ class UpliftTreeClassifier:
             Has type numpy.int32.
             The counts of each of the control
             and treament groups of the right node, i.e. [N(T=i)...]
-                
+
         Returns
         -------
         lrt : Likelihood ratio test statistic
@@ -1422,7 +1422,7 @@ class UpliftTreeClassifier:
     def evaluate_IDDP(nodeSummary):
         '''
         Calculate Delta P as split evaluation criterion for a given node.
-        
+
         Args
         ----
         nodeSummary : dictionary
@@ -1444,7 +1444,7 @@ class UpliftTreeClassifier:
                           np.ndarray[N_TYPE_t, ndim=1] node_summary_n):
         '''
         Calculate Delta P as split evaluation criterion for a given node.
-        
+
         Args
         ----
         node_summary_p : array of shape [n_class]
@@ -1589,7 +1589,7 @@ class UpliftTreeClassifier:
             Normalization factor.
         '''
         cdef N_TYPE_t[::1] cur_summary_n = cur_node_summary_n
-        cdef N_TYPE_t[::1] left_summary_n = left_node_summary_n        
+        cdef N_TYPE_t[::1] left_summary_n = left_node_summary_n
         cdef int n_class = cur_summary_n.shape[0]
         cdef int i = 0
 
@@ -1929,7 +1929,7 @@ class UpliftTreeClassifier:
         cdef np.ndarray[N_TYPE_t, ndim=1] val_left_summary_n = np.zeros(self.n_class, dtype = N_TYPE)
         cdef np.ndarray[P_TYPE_t, ndim=1] val_right_summary_p = np.zeros(self.n_class, dtype = P_TYPE)
         cdef np.ndarray[N_TYPE_t, ndim=1] val_right_summary_n = np.zeros(self.n_class, dtype = N_TYPE)
-        
+
         # dummy
         cdef int has_parent_summary = 0
         if parentNodeSummary_p is None:
@@ -2107,7 +2107,7 @@ class UpliftTreeClassifier:
                     for k in range(n_class):
                         if (abs(val_left_summary_p[k] - left_summary_p[k]) >
                                 min(val_left_summary_p[k], left_summary_p[k])/early_stopping_eval_diff_scale or
-                            abs(val_right_summary_p[k] - right_summary_p[k]) > 
+                            abs(val_right_summary_p[k] - right_summary_p[k]) >
                                 min(val_right_summary_p[k], right_summary_p[k])/early_stopping_eval_diff_scale):
                             early_stopping_flag = True
                             break
@@ -2160,13 +2160,13 @@ class UpliftTreeClassifier:
                         norm_factor = self.arr_normI(cur_summary_n, left_summary_n, alpha=0.9)
                     else:
                         norm_factor = 1
-                    gain = gain / norm_factor 
+                    gain = gain / norm_factor
                 if (gain > bestGain and len_X_l > min_samples_leaf and len_X_r > min_samples_leaf):
                     bestGain = gain
                     bestGainImp = gain_for_imp
                     best_col = col
                     best_value = value
-        
+
         # after finding the best split col and value
         if best_col is not None:
             bestAttribute = (best_col, best_value)
@@ -2364,7 +2364,7 @@ class UpliftRandomForestClassifier:
         child node, only effective for 'KL', 'ED', 'Chi', 'CTS' methods.
 
     early_stopping_eval_diff_scale: float, optional (default=1)
-        If train and valid uplift score diff bigger than 
+        If train and valid uplift score diff bigger than
         min(train_uplift_score,valid_uplift_score)/early_stopping_eval_diff_scale, stop.
 
     control_name: string
@@ -2427,6 +2427,7 @@ class UpliftRandomForestClassifier:
         self.control_name = control_name
         self.normalization = normalization
         self.honesty = honesty
+        self.estimation_sample_size = estimation_sample_size
         self.n_jobs = n_jobs
         self.joblib_prefer = joblib_prefer
 
@@ -2477,6 +2478,7 @@ class UpliftRandomForestClassifier:
                 control_name=self.control_name,
                 normalization=self.normalization,
                 honesty=self.honesty,
+                estimation_sample_size=self.estimation_sample_size,
                 random_state=random_state.randint(MAX_INT))
             for _ in range(self.n_estimators)
         ]
@@ -2512,7 +2514,7 @@ class UpliftRandomForestClassifier:
             x_val_bt = X_val[bt_val_index]
             y_val_bt = y_val[bt_val_index]
             treatment_val_bt = treatment_val[bt_val_index]
-    
+
             tree.fit(X=x_train_bt, treatment=treatment_train_bt, y=y_train_bt, X_val=x_val_bt, treatment_val=treatment_val_bt, y_val=y_val_bt)
         return tree