From 49ab0add2ab52c1bd0a99700954342a1999e1de0 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 7 Oct 2025 09:55:00 +0000 Subject: [PATCH 1/5] gcs mode --- conf/default/reporting.conf.default | 4 ++ modules/reporting/gcs.py | 88 +++++++++++++++++++---------- 2 files changed, 61 insertions(+), 31 deletions(-) diff --git a/conf/default/reporting.conf.default b/conf/default/reporting.conf.default index cc56bfa46a8..9dcd41fa34d 100644 --- a/conf/default/reporting.conf.default +++ b/conf/default/reporting.conf.default @@ -231,6 +231,10 @@ exclude_dirs = logs, shots # Good examples are large report formats you don't need in GCS. exclude_files = +# Mode: zip - will submit all files and folders as a single zip archive. Useful to avoid spamming Pub/Sub notifications on file creation. +# Mode: file - will submit files one by one. +mode = zip + # Can be vm or json auth_by = vm # only if auth_by = json. The absolute path to your Google Cloud service account JSON key file. diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index f0ded8d10a8..15e26bb98fd 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -1,5 +1,7 @@ import os import logging +import tempfile +import zipfile from lib.cuckoo.common.constants import CUCKOO_ROOT from lib.cuckoo.common.abstracts import Report from lib.cuckoo.common.exceptions import CuckooReportError @@ -40,7 +42,6 @@ def run(self, results): ) return - # Read configuration options from gcs.conf # Read configuration options from gcs.conf and validate them bucket_name = self.options.get("bucket_name") if not bucket_name: @@ -66,8 +67,7 @@ def run(self, results): exclude_dirs_str = self.options.get("exclude_dirs", "") exclude_files_str = self.options.get("exclude_files", "") - # --- NEW: Parse the exclusion strings into sets for efficient lookups --- - # The `if item.strip()` ensures we don't have empty strings from trailing commas + # Parse the exclusion strings into sets for efficient lookups exclude_dirs = {item.strip() for 
item in exclude_dirs_str.split(",") if item.strip()} exclude_files = {item.strip() for item in exclude_files_str.split(",") if item.strip()} @@ -76,6 +76,9 @@ def run(self, results): if exclude_files: log.debug("GCS reporting will exclude files: %s", exclude_files) + # Get the upload mode, defaulting to 'file' for backward compatibility + mode = self.options.get("mode", "file") + try: # --- Authentication --- log.debug("Authenticating with Google Cloud Storage...") @@ -87,39 +90,62 @@ def run(self, results): "The specified GCS bucket '%s' does not exist or you don't have permission to access it.", bucket_name ) - # --- File Upload --- - # Use the analysis ID as a "folder" in the bucket analysis_id = results.get("info", {}).get("id") if not analysis_id: raise CuckooReportError("Could not get analysis ID from results.") - log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket_name) - - # self.analysis_path is the path to the analysis results directory - # e.g., /opt/cape/storage/analyses/123/ source_directory = self.analysis_path - for root, dirs, files in os.walk(source_directory): - # We modify 'dirs' in-place to prevent os.walk from descending into them. - # This is the most efficient way to skip entire directory trees. 
- dirs[:] = [d for d in dirs if d not in exclude_dirs] - - for filename in files: - # --- NEW: File Exclusion Logic --- - if filename in exclude_files: - log.debug("Skipping excluded file: %s", os.path.join(root, filename)) - continue # Skip to the next file - - local_path = os.path.join(root, filename) - relative_path = os.path.relpath(local_path, source_directory) - blob_name = f"{analysis_id}/{relative_path}" - - log.debug("Uploading '%s' to '%s'", local_path, blob_name) - - blob = bucket.blob(blob_name) - blob.upload_from_filename(local_path) - - log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id) + if mode == "zip": + self.upload_zip_archive(bucket, analysis_id, source_directory, exclude_dirs, exclude_files) + elif mode == "file": + self.upload_files_individually(bucket, analysis_id, source_directory, exclude_dirs, exclude_files) + else: + raise CuckooReportError("Invalid GCS upload mode specified: %s. Must be 'file' or 'zip'.", mode) except Exception as e: - raise CuckooReportError("Failed to upload report to GCS: %s", str(e)) + raise CuckooReportError("Failed to upload report to GCS: %s", e) + + def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): + """Compresses and uploads the analysis directory as a single zip file.""" + log.debug("Compressing and uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name) + zip_name = "%s.zip" % analysis_id + blob_name = zip_name + + with tempfile.NamedTemporaryFile(delete=False) as tmp_zip_file: + with zipfile.ZipFile(tmp_zip_file, "w", zipfile.ZIP_DEFLATED) as archive: + for root, dirs, files in os.walk(source_directory): + dirs[:] = [d for d in dirs if d not in exclude_dirs] + for filename in files: + if filename in exclude_files: + log.debug("Skipping excluded file: %s", os.path.join(root, filename)) + continue + local_path = os.path.join(root, filename) + relative_path = os.path.relpath(local_path, source_directory) + 
archive.write(local_path, relative_path) + + log.debug("Uploading '%s' to '%s'", tmp_zip_file.name, blob_name) + blob = bucket.blob(blob_name) + blob.upload_from_filename(tmp_zip_file.name) + + os.unlink(tmp_zip_file.name) + log.info("Successfully uploaded archive for analysis %d to GCS.", analysis_id) + + def upload_files_individually(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): + """Uploads analysis files individually to the GCS bucket.""" + log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name) + for root, dirs, files in os.walk(source_directory): + dirs[:] = [d for d in dirs if d not in exclude_dirs] + for filename in files: + if filename in exclude_files: + log.debug("Skipping excluded file: %s", os.path.join(root, filename)) + continue + local_path = os.path.join(root, filename) + relative_path = os.path.relpath(local_path, source_directory) + blob_name = "%s/%s" % (analysis_id, relative_path) + + log.debug("Uploading '%s' to '%s'", local_path, blob_name) + blob = bucket.blob(blob_name) + blob.upload_from_filename(local_path) + + log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id) From 1ddc762a81e85802ed577e8cc26569a4f51361ec Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 7 Oct 2025 09:58:43 +0000 Subject: [PATCH 2/5] Update modules/reporting/gcs.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- modules/reporting/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index 15e26bb98fd..55e6d6354f9 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -142,7 +142,7 @@ def upload_files_individually(self, bucket, analysis_id, source_directory, exclu continue local_path = os.path.join(root, filename) relative_path = os.path.relpath(local_path, source_directory) - blob_name = "%s/%s" % (analysis_id, relative_path) + 
blob_name = f"{analysis_id}/{relative_path}" log.debug("Uploading '%s' to '%s'", local_path, blob_name) blob = bucket.blob(blob_name) From 176c4ef6907d1c63e9e78734c130173718e341ca Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 7 Oct 2025 09:58:56 +0000 Subject: [PATCH 3/5] Update modules/reporting/gcs.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- modules/reporting/gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index 55e6d6354f9..6269c5ddc63 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -104,7 +104,7 @@ def run(self, results): raise CuckooReportError("Invalid GCS upload mode specified: %s. Must be 'file' or 'zip'.", mode) except Exception as e: - raise CuckooReportError("Failed to upload report to GCS: %s", e) + raise CuckooReportError(f"Failed to upload report to GCS: {e}") from e def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): """Compresses and uploads the analysis directory as a single zip file.""" From 007d9c722318b8f79f227c23d97766cad449b737 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 7 Oct 2025 10:07:33 +0000 Subject: [PATCH 4/5] Update modules/reporting/gcs.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- modules/reporting/gcs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index 6269c5ddc63..b30d022a1a7 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -112,7 +112,8 @@ def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs zip_name = "%s.zip" % analysis_id blob_name = zip_name - with tempfile.NamedTemporaryFile(delete=False) as tmp_zip_file: + with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip_file: + tmp_zip_file_name 
= tmp_zip_file.name with zipfile.ZipFile(tmp_zip_file, "w", zipfile.ZIP_DEFLATED) as archive: for root, dirs, files in os.walk(source_directory): dirs[:] = [d for d in dirs if d not in exclude_dirs] @@ -124,11 +125,12 @@ def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs relative_path = os.path.relpath(local_path, source_directory) archive.write(local_path, relative_path) - log.debug("Uploading '%s' to '%s'", tmp_zip_file.name, blob_name) + try: + log.debug("Uploading '%s' to '%s'", tmp_zip_file_name, blob_name) blob = bucket.blob(blob_name) - blob.upload_from_filename(tmp_zip_file.name) - - os.unlink(tmp_zip_file.name) + blob.upload_from_filename(tmp_zip_file_name) + finally: + os.unlink(tmp_zip_file_name) log.info("Successfully uploaded archive for analysis %d to GCS.", analysis_id) def upload_files_individually(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): From 83e346b27689125ae0066d7a66fa743456dd1e2a Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 7 Oct 2025 10:57:30 +0000 Subject: [PATCH 5/5] gcs mode --- modules/reporting/gcs.py | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index b30d022a1a7..ff2fc56ece1 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -106,6 +106,21 @@ def run(self, results): except Exception as e: raise CuckooReportError(f"Failed to upload report to GCS: {e}") from e + def _iter_files_to_upload(self, source_directory, exclude_dirs, exclude_files): + """Generator that yields files to be uploaded, skipping excluded ones.""" + for root, dirs, files in os.walk(source_directory): + # Exclude specified directories + dirs[:] = [d for d in dirs if d not in exclude_dirs] + for filename in files: + # Exclude specified files + if filename in exclude_files: + log.debug("Skipping excluded file: %s", os.path.join(root, filename)) + continue + + local_path 
= os.path.join(root, filename) + relative_path = os.path.relpath(local_path, source_directory) + yield local_path, relative_path + def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): """Compresses and uploads the analysis directory as a single zip file.""" log.debug("Compressing and uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name) @@ -115,15 +130,8 @@ def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_zip_file: tmp_zip_file_name = tmp_zip_file.name with zipfile.ZipFile(tmp_zip_file, "w", zipfile.ZIP_DEFLATED) as archive: - for root, dirs, files in os.walk(source_directory): - dirs[:] = [d for d in dirs if d not in exclude_dirs] - for filename in files: - if filename in exclude_files: - log.debug("Skipping excluded file: %s", os.path.join(root, filename)) - continue - local_path = os.path.join(root, filename) - relative_path = os.path.relpath(local_path, source_directory) - archive.write(local_path, relative_path) + for local_path, relative_path in self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files): + archive.write(local_path, relative_path) try: log.debug("Uploading '%s' to '%s'", tmp_zip_file_name, blob_name) @@ -136,18 +144,10 @@ def upload_zip_archive(self, bucket, analysis_id, source_directory, exclude_dirs def upload_files_individually(self, bucket, analysis_id, source_directory, exclude_dirs, exclude_files): """Uploads analysis files individually to the GCS bucket.""" log.debug("Uploading files for analysis ID %d to GCS bucket '%s'", analysis_id, bucket.name) - for root, dirs, files in os.walk(source_directory): - dirs[:] = [d for d in dirs if d not in exclude_dirs] - for filename in files: - if filename in exclude_files: - log.debug("Skipping excluded file: %s", os.path.join(root, filename)) - continue - local_path = os.path.join(root, filename) - relative_path 
= os.path.relpath(local_path, source_directory) - blob_name = f"{analysis_id}/{relative_path}" - - log.debug("Uploading '%s' to '%s'", local_path, blob_name) - blob = bucket.blob(blob_name) - blob.upload_from_filename(local_path) + for local_path, relative_path in self._iter_files_to_upload(source_directory, exclude_dirs, exclude_files): + blob_name = f"{analysis_id}/{relative_path}" + log.debug("Uploading '%s' to '%s'", local_path, blob_name) + blob = bucket.blob(blob_name) + blob.upload_from_filename(local_path) log.info("Successfully uploaded files for analysis %d to GCS.", analysis_id)