import numpy as np
import time

from skglm import GeneralizedLinearEstimator
from skglm.datafits import Poisson
from skglm.penalties import L2
from skglm.solvers import LBFGS
from skglm.utils.data import make_correlated_data
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance, mean_absolute_error


def generate_correlated_poisson_data(
    n_samples=20000,
    n_features=50,
    rho=0.5,
    density=0.5,
    seed=42
):
    """Generate a synthetic Poisson-GLM dataset with correlated features.

    Parameters
    ----------
    n_samples : int, default 20000
        Number of observations.
    n_features : int, default 50
        Number of features.
    rho : float, default 0.5
        Feature correlation strength passed to ``make_correlated_data``.
    density : float, default 0.5
        Fraction of non-zero entries in the true coefficient vector.
    seed : int, default 42
        Seed for both the design matrix and the Poisson draws.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Design matrix.
    y : ndarray of shape (n_samples,)
        Poisson-distributed counts.
    w_true : ndarray of shape (n_features,)
        Ground-truth coefficients.
    intercept_true : float
        Ground-truth intercept.
    """
    print("\n1. Generating synthetic correlated data for Poisson GLM...")
    print(
        f" (n_samples={n_samples}, n_features={n_features}, "
        f"rho={rho}, density={density})"
    )

    # Use make_correlated_data to get X and w_true; its Gaussian response is
    # discarded because we re-generate a Poisson response below.
    X, _, w_true = make_correlated_data(
        n_samples=n_samples,
        n_features=n_features,
        rho=rho,
        snr=10,
        density=density,
        random_state=seed
    )

    # Define a true intercept
    intercept_true = -1.0

    # Calculate the linear predictor, clipped so exp() cannot overflow and
    # the Poisson means stay in a numerically sane range.
    eta = intercept_true + X @ w_true
    eta = np.clip(eta, -15, 15)
    mu = np.exp(eta)

    # Generate the Poisson-distributed response variable
    rng = np.random.default_rng(seed)
    y = rng.poisson(mu)

    return X, y, w_true, intercept_true


def _timed_fit(estimator, X, y):
    """Fit *estimator* on ``(X, y)`` and return the wall-clock fit time (s).

    Uses ``time.perf_counter`` — a monotonic, high-resolution clock — so both
    libraries are timed identically. (The original script timed sklearn with
    ``time.time``, mixing clocks and skewing the comparison.)
    """
    start = time.perf_counter()
    estimator.fit(X, y)
    return time.perf_counter() - start


def run_benchmark():
    """Main function to run the GLM benchmark."""

    # 1. Generate data
    # Parameters for data generation
    N_SAMPLES = 100000
    N_FEATURES = 1000
    RHO = 0.6
    DENSITY = 0.5  # Sparsity of true coefficients

    X, y_true, w_true, intercept_true = generate_correlated_poisson_data(
        n_samples=N_SAMPLES,
        n_features=N_FEATURES,
        rho=RHO,
        density=DENSITY,
        seed=42
    )

    # 2. Shared model parameters
    print("\n2. Setting up models...")
    alpha = 0.01  # L2 regularization strength
    tol = 1e-4  # Same tolerance as sklearn's PoissonRegressor
    iter_n = 1000  # Increase max_iter to allow convergence

    # 3a. Fit the GLM with skglm
    print("\n3a. Fitting the GLM with skglm...")
    estimator_skglm = GeneralizedLinearEstimator(
        datafit=Poisson(),
        # Using L2 penalty (Ridge) for LBFGS compatibility
        penalty=L2(alpha=alpha),
        solver=LBFGS(verbose=False, tol=tol, max_iter=iter_n, fit_intercept=True)
    )
    skglm_fit_time = _timed_fit(estimator_skglm, X, y_true)
    print(f" skglm fit complete in {skglm_fit_time:.4f} seconds.")

    # 3b. Fit the GLM with scikit-learn
    print("\n3b. Fitting the GLM with scikit-learn...")
    # PoissonRegressor in sklearn uses an L2 penalty.
    estimator_sklearn = PoissonRegressor(
        alpha=alpha,
        fit_intercept=True,
        tol=tol,
        solver='lbfgs',
        max_iter=iter_n
    )
    sklearn_fit_time = _timed_fit(estimator_sklearn, X, y_true)
    print(f" sklearn fit complete in {sklearn_fit_time:.4f} seconds.")

    # 4. Compare the results
    print("\n" + "="*80)
    print("RESULTS COMPARISON")
    print("="*80)

    # --- Coefficient Comparison ---
    print("\n--- Coefficient Comparison ---")

    # Intercept
    print(f"{'Parameter':<20} | {'Ground Truth':<15} | "
          f"{'skglm Fit':<15} | {'sklearn Fit':<15}")
    print("-" * 75)
    print(f"{'Intercept':<20} | {intercept_true:<15.4f} | "
          f"{estimator_skglm.intercept_:<15.4f} | "
          f"{estimator_sklearn.intercept_:<15.4f}")

    # MAE of Coefficients
    mae_skglm = mean_absolute_error(w_true, estimator_skglm.coef_)
    mae_sklearn = mean_absolute_error(w_true, estimator_sklearn.coef_)
    print(f"\n{'MAE vs. w_true':<20} | {'':<15} | "
          f"{mae_skglm:<15.6f} | {mae_sklearn:<15.6f}")

    # Spot-check of first 5 coefficients
    print("\nSpot-check of first 5 coefficients:")
    print(f"{'Parameter':<12} | {'Ground Truth':<15} | "
          f"{'skglm Fit':<15} | {'sklearn Fit':<15}")
    print("-" * 65)
    for i in range(min(5, N_FEATURES)):
        print(
            f"w_{i:<10} | {w_true[i]:<15.4f} | "
            f"{estimator_skglm.coef_[i]:<15.4f} | "
            f"{estimator_sklearn.coef_[i]:<15.4f}")

    # --- Timing Comparison ---
    print("\n--- Fitting Time Comparison ---")
    print(f"skglm (LBFGS): {skglm_fit_time:.4f} seconds")
    print(f"sklearn (L-BFGS): {sklearn_fit_time:.4f} seconds")
    if skglm_fit_time < sklearn_fit_time:
        speedup = sklearn_fit_time / \
            skglm_fit_time if skglm_fit_time > 0 else float('inf')
        print(f" >> skglm was {speedup:.2f}x faster.")
    else:
        speedup = skglm_fit_time / \
            sklearn_fit_time if sklearn_fit_time > 0 else float('inf')
        print(f" >> sklearn was {speedup:.2f}x faster.")

    # --- Performance Metrics Comparison ---
    def calculate_metrics(estimator, X, y_true):
        # Total model deviance: sklearn returns the *mean* Poisson deviance,
        # so scale by n to get the summed deviance.
        y_pred = estimator.predict(X)
        # clip to avoid log(0) in deviance calculation
        y_pred = np.clip(y_pred, 1e-9, None)
        dev_model = len(y_true) * mean_poisson_deviance(y_true, y_pred)
        return dev_model

    dev_model_skglm = calculate_metrics(estimator_skglm, X, y_true)
    dev_model_sklearn = calculate_metrics(estimator_sklearn, X, y_true)

    # Null deviance: deviance of an intercept-only model (predicting the mean)
    y_null = np.full_like(y_true, fill_value=y_true.mean(), dtype=float)
    dev_null = len(y_true) * mean_poisson_deviance(y_true, y_null)

    pseudo_r2_skglm = 1.0 - (dev_model_skglm / dev_null)
    pseudo_r2_sklearn = 1.0 - (dev_model_sklearn / dev_null)

    print("\n--- Performance Metrics ---")
    print(f"{'Metric':<30} | {'skglm':<15} | {'sklearn':<15}")
    print("-" * 65)
    print(f"{'Model Deviance':<30} | {dev_model_skglm:<15,.2f} | "
          f"{dev_model_sklearn:<15,.2f}")
    print(f"{'Null Deviance':<30} | {dev_null:<15,.2f} | {dev_null:<15,.2f}")
    print(f"{'Pseudo R² (Deviance Explained)':<30} | "
          f"{pseudo_r2_skglm:<15.4f} | {pseudo_r2_sklearn:<15.4f}")


if __name__ == "__main__":
    run_benchmark()
""" @@ -31,9 +34,11 @@ class LBFGS(BaseSolver): _datafit_required_attr = ("gradient",) _penalty_required_attr = ("gradient",) - def __init__(self, max_iter=50, tol=1e-4, verbose=False): + def __init__(self, max_iter=50, tol=1e-4, fit_intercept=False, verbose=False): self.max_iter = max_iter self.tol = tol + self.fit_intercept = fit_intercept + self.warm_start = False self.verbose = verbose def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): @@ -46,25 +51,46 @@ def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): datafit.initialize(X, y) def objective(w): - Xw = X @ w - datafit_value = datafit.value(y, w, Xw) - penalty_value = penalty.value(w) + if self.fit_intercept: + Xw = X @ w[:-1] + w[-1] + datafit_value = datafit.value(y, w[:-1], Xw) + penalty_value = penalty.value(w[:-1]) + else: + Xw = X @ w + datafit_value = datafit.value(y, w, Xw) + penalty_value = penalty.value(w) return datafit_value + penalty_value def d_jac(w): - Xw = X @ w - datafit_grad = datafit.gradient(X, y, Xw) - penalty_grad = penalty.gradient(w) - - return datafit_grad + penalty_grad + if self.fit_intercept: + Xw = X @ w[:-1] + w[-1] + datafit_grad = datafit.gradient(X, y, Xw) + penalty_grad = penalty.gradient(w[:-1]) + intercept_grad = datafit.intercept_update_step(y, Xw) + return np.concatenate([datafit_grad + penalty_grad, [intercept_grad]]) + else: + Xw = X @ w + datafit_grad = datafit.gradient(X, y, Xw) + penalty_grad = penalty.gradient(w) + + return datafit_grad + penalty_grad def s_jac(w): - Xw = X @ w - datafit_grad = datafit.gradient_sparse(X.data, X.indptr, X.indices, y, Xw) - penalty_grad = penalty.gradient(w) - - return datafit_grad + penalty_grad + if self.fit_intercept: + Xw = X @ w[:-1] + w[-1] + datafit_grad = datafit.gradient_sparse( + X.data, X.indptr, X.indices, y, Xw) + penalty_grad = penalty.gradient(w[:-1]) + intercept_grad = datafit.intercept_update_step(y, Xw) + return np.concatenate([datafit_grad + penalty_grad, [intercept_grad]]) + 
else: + Xw = X @ w + datafit_grad = datafit.gradient_sparse( + X.data, X.indptr, X.indices, y, Xw) + penalty_grad = penalty.gradient(w) + + return datafit_grad + penalty_grad def callback_post_iter(w_k): # save p_obj @@ -81,7 +107,7 @@ def callback_post_iter(w_k): ) n_features = X.shape[1] - w = np.zeros(n_features) if w_init is None else w_init + w = np.zeros(n_features + self.fit_intercept) if w_init is None else w_init jac = s_jac if issparse(X) else d_jac p_objs_out = [] From 1173877bba429c04146f7fed11a2ff6206430bdc Mon Sep 17 00:00:00 2001 From: floriankozikowski Date: Thu, 31 Jul 2025 11:14:15 +0200 Subject: [PATCH 2/2] refactor --- skglm/solvers/lbfgs.py | 46 ++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/skglm/solvers/lbfgs.py b/skglm/solvers/lbfgs.py index e57c7178..ba3d8b22 100644 --- a/skglm/solvers/lbfgs.py +++ b/skglm/solvers/lbfgs.py @@ -51,45 +51,39 @@ def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): datafit.initialize(X, y) def objective(w): + w_features = w[:n_features] + Xw = X @ w_features if self.fit_intercept: - Xw = X @ w[:-1] + w[-1] - datafit_value = datafit.value(y, w[:-1], Xw) - penalty_value = penalty.value(w[:-1]) - else: - Xw = X @ w - datafit_value = datafit.value(y, w, Xw) - penalty_value = penalty.value(w) - + Xw += w[-1] + datafit_value = datafit.value(y, w_features, Xw) + penalty_value = penalty.value(w_features) return datafit_value + penalty_value def d_jac(w): + w_features = w[:n_features] + Xw = X @ w_features + if self.fit_intercept: + Xw += w[-1] + datafit_grad = datafit.gradient(X, y, Xw) + penalty_grad = penalty.gradient(w_features) if self.fit_intercept: - Xw = X @ w[:-1] + w[-1] - datafit_grad = datafit.gradient(X, y, Xw) - penalty_grad = penalty.gradient(w[:-1]) - intercept_grad = datafit.intercept_update_step(y, Xw) + intercept_grad = datafit.raw_grad(y, Xw).sum() return np.concatenate([datafit_grad + penalty_grad, [intercept_grad]]) else: - 
Xw = X @ w - datafit_grad = datafit.gradient(X, y, Xw) - penalty_grad = penalty.gradient(w) - return datafit_grad + penalty_grad def s_jac(w): + w_features = w[:n_features] + Xw = X @ w_features + if self.fit_intercept: + Xw += w[-1] + datafit_grad = datafit.gradient_sparse( + X.data, X.indptr, X.indices, y, Xw) + penalty_grad = penalty.gradient(w_features) if self.fit_intercept: - Xw = X @ w[:-1] + w[-1] - datafit_grad = datafit.gradient_sparse( - X.data, X.indptr, X.indices, y, Xw) - penalty_grad = penalty.gradient(w[:-1]) - intercept_grad = datafit.intercept_update_step(y, Xw) + intercept_grad = datafit.raw_grad(y, Xw).sum() return np.concatenate([datafit_grad + penalty_grad, [intercept_grad]]) else: - Xw = X @ w - datafit_grad = datafit.gradient_sparse( - X.data, X.indptr, X.indices, y, Xw) - penalty_grad = penalty.gradient(w) - return datafit_grad + penalty_grad def callback_post_iter(w_k):