Merge remote-tracking branch 'origin/master'

nworb-cire · nworb-cire · commit 62b3efa4e067 · 2025-11-10T07:19:58.000-07:00
* origin/master:
  [MNT] add automatic test workflow (scikit-learn-contrib#146)
  add error message when user passes decision trees (scikit-learn-contrib#141)
  migrate from unittest to pytest (scikit-learn-contrib#140)
  Bump pypa/gh-action-pypi-publish in /.github/workflows (scikit-learn-contrib#144)
  evaluate features only after 5th iteration (scikit-learn-contrib#137)
  add gitignore (scikit-learn-contrib#139)
  fix typo in docstring (scikit-learn-contrib#138)
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
@@ -38,4 +38,4 @@ jobs:
     - name: Build a binary wheel
       run: python setup.py sdist bdist_wheel
     - name: Publish distribution 📦 to PyPI
-      uses: pypa/gh-action-pypi-publish@v1.9.0
+      uses: pypa/gh-action-pypi-publish@v1.13.0
diff --git a/.github/workflows/test_package.yml b/.github/workflows/test_package.yml
@@ -0,0 +1,78 @@
+name: Test boruta
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        include:
+          # Regular Python versions (no special package versions)
+          - python-version: "3.10"
+          - python-version: "3.12"
+          - python-version: "3.13"
+
+          # Python 3.11 with different scikit-learn versions
+          - python-version: "3.11"
+            sklearn-version: "1.5.2"
+          - python-version: "3.11"
+            sklearn-version: "1.6.1"
+          - python-version: "3.11"
+            sklearn-version: "1.7.0"
+
+          # Python 3.11 with different NumPy versions
+          - python-version: "3.11"
+            numpy-version: "1.26.4"
+          - python-version: "3.11"
+            numpy-version: "2.0.1"
+          - python-version: "3.11"
+            numpy-version: "2.1.1"
+          - python-version: "3.11"
+            numpy-version: "2.2.2"
+          - python-version: "3.11"
+            numpy-version: "2.3.1"
+
+    name: >-
+      Python ${{ matrix.python-version }}
+      ${{ matrix.sklearn-version && format('(scikit-learn {0})', matrix.sklearn-version) || '' }}
+      ${{ matrix.numpy-version && format('(NumPy {0})', matrix.numpy-version) || '' }}
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r test_requirements.txt
+
+          # Install specific scikit-learn version if defined
+          if [ -n "${{ matrix.sklearn-version }}" ]; then
+            echo "Installing scikit-learn==${{ matrix.sklearn-version }}"
+            pip install scikit-learn==${{ matrix.sklearn-version }}
+          fi
+
+          # Install specific NumPy version if defined
+          if [ -n "${{ matrix.numpy-version }}" ]; then
+            echo "Installing numpy==${{ matrix.numpy-version }}"
+            pip install numpy==${{ matrix.numpy-version }}
+          fi
+
+      - name: Test with pytest
+        run: |
+          pip install pytest
+          pytest
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,68 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# Miscellaneous
+.idea
+.vscode
+*.DS_Store
+*.db
+*.pptx
diff --git a/boruta/boruta_py.py b/boruta/boruta_py.py
@@ -45,13 +45,13 @@ class BorutaPy(BaseEstimator, SelectorMixin):
         crucial parameter. For more info, please read about the perc parameter.
     - Automatic tree number:
         Setting the n_estimator to 'auto' will calculate the number of trees
-        in each itartion based on the number of features under investigation.
+        in each iteration based on the number of features under investigation.
         This way more trees are used when the training data has many features
         and less when most of the features have been rejected.
     - Ranking of features:
         After fitting BorutaPy it provides the user with ranking of features.
         Confirmed ones are 1, Tentatives are 2, and the rejected are ranked
-        starting from 3, based on their feautre importance history through
+        starting from 3, based on their feature importance history through
         the iterations.
 
     We highly recommend using pruned trees with a depth between 3-7.
@@ -140,7 +140,7 @@ class BorutaPy(BaseEstimator, SelectorMixin):
     support_weak_ : array of shape [n_features]
 
         The mask of selected tentative features, which haven't gained enough
-        support during the max_iter number of iterations..
+        support during the max_iter number of iterations.
 
     ranking_ : array of shape [n_features]
 
@@ -328,7 +328,7 @@ def _fit(self, X, y):
 
         # set n_estimators
         if self.n_estimators != 'auto':
-            self.estimator.set_params(n_estimators=self.n_estimators)
+            self._set_n_estimators(self.n_estimators)
 
         # main feature selection loop
         while np.any(dec_reg == 0) and _iter < self.max_iter:
@@ -337,7 +337,7 @@ def _fit(self, X, y):
                 # number of features that aren't rejected
                 not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                 n_tree = self._get_tree_num(not_rejected)
-                self.estimator.set_params(n_estimators=n_tree)
+                self._set_n_estimators(n_estimators=n_tree)
 
             # make sure we start with a new tree in each iteration
             if self._is_lightgbm:
@@ -358,13 +358,15 @@ def _fit(self, X, y):
             # register which feature is more imp than the max of shadows
             hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)
 
-            # based on hit_reg we check if a feature is doing better than
-            # expected by chance
-            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)
+            # Only test after the 5th round.
+            if _iter > 4:
+                # based on hit_reg we check if a feature is doing better than
+                # expected by chance
+                dec_reg = self._do_tests(dec_reg, hit_reg, _iter)
 
-            # print out confirmed features
-            if self.verbose > 0 and _iter < self.max_iter:
-                self._print_results(dec_reg, _iter, 0)
+                # print out confirmed features
+                if self.verbose > 0 and _iter < self.max_iter:
+                    self._print_results(dec_reg, _iter, 0)
             if _iter < self.max_iter:
                 _iter += 1
                 
@@ -454,6 +456,17 @@ def _transform(self, X, weak=False, return_df=False):
             X = X[:, indices]
         return X
 
+    def _set_n_estimators(self, n_estimators):
+        try:
+            self.estimator.set_params(n_estimators=n_estimators)
+        except ValueError:
+            raise ValueError(
+                f"The estimator {self.estimator} does not take the parameter "
+                "n_estimators. Use Random Forests or gradient boosting machines "
+                "instead."
+            )
+        return self
+
     def _get_support_mask(self):
         check_is_fitted(self, 'support_')
         return self.support_
diff --git a/boruta/test/test_boruta.py b/boruta/test/test_boruta.py
@@ -0,0 +1,83 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+
+from boruta import BorutaPy
+
+
+@pytest.mark.parametrize("tree_n,expected", [(10, 44), (100, 141)])
+def test_get_tree_num(tree_n, expected):
+    rfc = RandomForestClassifier(max_depth=10)
+    bt = BorutaPy(rfc)
+    assert bt._get_tree_num(tree_n) == expected
+
+
+@pytest.fixture(scope="module")
+def Xy():
+    np.random.seed(42)
+    y = np.random.binomial(1, 0.5, 1000)
+    X = np.zeros((1000, 10))
+
+    z = (y - np.random.binomial(1, 0.1, 1000) +
+         np.random.binomial(1, 0.1, 1000))
+    z[z == -1] = 0
+    z[z == 2] = 1
+
+    # 5 relevant features
+    X[:, 0] = z
+    X[:, 1] = (y * np.abs(np.random.normal(0, 1, 1000)) +
+               np.random.normal(0, 0.1, 1000))
+    X[:, 2] = y + np.random.normal(0, 1, 1000)
+    X[:, 3] = y**2 + np.random.normal(0, 1, 1000)
+    X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)
+
+    # 5 irrelevant features
+    X[:, 5] = np.random.normal(0, 1, 1000)
+    X[:, 6] = np.random.poisson(1, 1000)
+    X[:, 7] = np.random.binomial(1, 0.3, 1000)
+    X[:, 8] = np.random.normal(0, 1, 1000)
+    X[:, 9] = np.random.poisson(1, 1000)
+
+    return X, y
+
+
+def test_if_boruta_extracts_relevant_features(Xy):
+    X, y = Xy
+    rfc = RandomForestClassifier()
+    bt = BorutaPy(rfc)
+    bt.fit(X, y)
+    assert list(range(5)) == list(np.where(bt.support_)[0])
+
+
+def test_if_it_works_with_dataframe_input(Xy):
+    X, y = Xy
+    X_df, y_df = pd.DataFrame(X), pd.Series(y)
+    bt = BorutaPy(RandomForestClassifier())
+    bt.fit(X_df, y_df)
+    assert list(range(5)) == list(np.where(bt.support_)[0])
+
+
+def test_dataframe_is_returned(Xy):
+    X, y = Xy
+    X_df, y_df = pd.DataFrame(X), pd.Series(y)
+    rfc = RandomForestClassifier()
+    bt = BorutaPy(rfc)
+    bt.fit(X_df, y_df)
+    assert isinstance(bt.transform(X_df, return_df=True), pd.DataFrame)
+
+
+@pytest.mark.parametrize("tree", [ExtraTreeClassifier(), DecisionTreeClassifier()])
+def test_boruta_with_decision_trees(tree, Xy):
+    msg = (
+        f"The estimator {tree} does not take the parameter "
+        "n_estimators. Use Random Forests or gradient boosting machines "
+        "instead."
+    )
+    X, y = Xy
+    bt = BorutaPy(tree)
+    with pytest.raises(ValueError) as record:
+        bt.fit(X, y)
+
+    assert str(record.value) == msg
diff --git a/boruta/test/unit_tests.py b/boruta/test/unit_tests.py
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+numpy>=1.26.4
+pandas>=2.2.0
+scikit-learn>=1.5.2
diff --git a/test_requirements.txt b/test_requirements.txt
@@ -0,0 +1,7 @@
+-r requirements.txt
+pytest>=5.4.1
+
+# repo maintenance tooling
+black>=21.5b1
+flake8>=3.9.2
+isort>=5.8.0

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+numpy>=1.26.4`
	`2`	`+pandas>=2.2.0`
	`3`	`+scikit-learn>=1.5.2`