From e9db220d75926e2a3bafe2b986078d980e5b5bac Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra
Date: Mon, 15 Dec 2025 16:59:46 +0530
Subject: [PATCH] Scan files for packages faster

Add a new "Binary" optional step which also scans for binaries in files.
Also adds package scan performance improvements from scancode.

Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4064
Signed-off-by: Ayan Sinha Mahapatra
---
Review note: the "Binary" step records its selection in a dedicated
`binary_scan_enabled` attribute instead of assigning `self.scan_binaries`.
Assigning to `self.scan_binaries` shadowed the step method on the instance,
and when the step was not selected `self.scan_binaries or False` evaluated
to the bound method itself, which is truthy, so a truthy `binary` value was
passed to the scan regardless of the selection. The post-image blob id on
the inspect_packages.py index line predates this adjustment.

 pyproject.toml                         | 8 +++-----
 scanpipe/pipelines/inspect_packages.py | 7 +++++++
 scanpipe/pipes/scancode.py             | 2 ++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a91bbc84d0..79a929d05c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,9 +58,9 @@ dependencies = [
   # Docker
   "container-inspector==33.0.0",
   # ScanCode-toolkit
-  "scancode-toolkit[packages]==32.4.1",
+  "scancode-toolkit[packages]@git+https://github.com/aboutcode-org/scancode-toolkit.git@6b6a79b8a1c0b9789a466df4c5623ab723890a76",
   "extractcode[full]==31.0.0",
-  "commoncode==32.3.0",
+  "commoncode==32.4.0",
   "Beautifulsoup4[chardet]==4.13.4",
   "packageurl-python==0.17.6",
   # Workaround issue https://github.com/aboutcode-org/scancode.io/issues/1795
@@ -101,9 +101,7 @@ dependencies = [
   # AboutCode pipeline
   "aboutcode.pipeline==0.2.1",
   # ScoreCode
-  "scorecode==0.0.4",
-  # Workaround issue https://github.com/aboutcode-org/scancode.io/issues/1885
-  "click==8.2.1"
+  "scorecode==0.0.4"
 ]
 
 [project.optional-dependencies]
diff --git a/scanpipe/pipelines/inspect_packages.py b/scanpipe/pipelines/inspect_packages.py
index 7674f7f25f..b0155437b8 100644
--- a/scanpipe/pipelines/inspect_packages.py
+++ b/scanpipe/pipelines/inspect_packages.py
@@ -49,10 +49,16 @@ def steps(cls):
             cls.collect_and_create_codebase_resources,
             cls.flag_empty_files,
             cls.flag_ignored_resources,
+            cls.scan_binaries,
             cls.scan_for_application_packages,
             cls.resolve_dependencies,
         )
 
+    @optional_step("Binary")
+    def scan_binaries(self):
+        """Enable binary scanning in the application packages scan step."""
+        self.binary_scan_enabled = True
+
     def scan_for_application_packages(self):
         """
         Scan resources for package information to add DiscoveredPackage
@@ -61,6 +67,7 @@ def scan_for_application_packages(self):
         scancode.scan_for_application_packages(
             project=self.project,
             assemble=True,
+            binary=getattr(self, "binary_scan_enabled", False),
             package_only=True,
             progress_logger=self.log,
         )
diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py
index 609e86b69c..4b182cdf86 100644
--- a/scanpipe/pipes/scancode.py
+++ b/scanpipe/pipes/scancode.py
@@ -409,6 +409,7 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
 def scan_for_application_packages(
     project,
     assemble=True,
+    binary=False,
     package_only=False,
     resource_qs=None,
     progress_logger=logger.info,
@@ -431,6 +432,7 @@ def scan_for_application_packages(
 
     scan_func_kwargs = {
         "package_only": package_only,
+        "binary": binary,
     }
 
     # Collect detected Package data and save it to the CodebaseResource it was