From 0cade5edfa525b738b9a742cbb0ffc4326df9ba5 Mon Sep 17 00:00:00 2001 From: Varsha U N Date: Tue, 10 Jun 2025 08:35:40 +0530 Subject: [PATCH 1/2] add support to store packages/archives locally Signed-off-by: Varsha U N --- scancodeio/settings.py | 10 + scancodeio/urls.py | 3 + scanpipe/forms.py | 6 + .../0068_packagearchive_downloadedpackage.py | 44 +++++ ...69_packagearchive_package_file_and_more.py | 23 +++ ...0070_project_use_local_storage_and_more.py | 28 +++ scanpipe/models.py | 99 +++++++++- scanpipe/pipelines/deploy_to_develop.py | 40 ++++ scanpipe/pipelines/docker.py | 39 ++++ scanpipe/pipelines/docker_windows.py | 40 +++- scanpipe/pipelines/inspect_packages.py | 41 ++++ scanpipe/pipelines/load_inventory.py | 31 +++ scanpipe/pipelines/load_sbom.py | 34 ++++ scanpipe/pipelines/resolve_dependencies.py | 72 ++++++- scanpipe/pipelines/root_filesystem.py | 74 +++++++ scanpipe/pipelines/scan_codebase.py | 41 ++++ scanpipe/pipelines/scan_single_package.py | 33 ++++ scanpipe/pipes/fetch.py | 98 ++++++++- scanpipe/templates/scanpipe/project_form.html | 6 +- .../data/scancode/InjectCode-Examples.zip | Bin 0 -> 9831 bytes scanpipe/tests/test_models.py | 186 ++++++++++++++++++ 21 files changed, 942 insertions(+), 6 deletions(-) create mode 100644 scanpipe/migrations/0068_packagearchive_downloadedpackage.py create mode 100644 scanpipe/migrations/0069_packagearchive_package_file_and_more.py create mode 100644 scanpipe/migrations/0070_project_use_local_storage_and_more.py create mode 100644 scanpipe/tests/data/scancode/InjectCode-Examples.zip diff --git a/scancodeio/settings.py b/scancodeio/settings.py index caa14fd1e..b787990b2 100644 --- a/scancodeio/settings.py +++ b/scancodeio/settings.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import os import sys import tempfile from pathlib import Path @@ -348,6 +349,15 @@ PROJECT_DIR("static"), ] +# Media files (Uploaded package archives, etc.) + +MEDIA_URL = "/media/" +MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media") + +# Package storage settings + +ENABLE_PACKAGE_STORAGE = env.bool("ENABLE_PACKAGE_STORAGE", default=False) + # Third-party apps CRISPY_TEMPLATE_PACK = "bootstrap3" diff --git a/scancodeio/urls.py b/scancodeio/urls.py index f0e475e17..35851e0fc 100644 --- a/scancodeio/urls.py +++ b/scancodeio/urls.py @@ -21,6 +21,7 @@ # Visit https://github.com/aboutcode-org/scancode.io for support and download. from django.conf import settings +from django.conf.urls.static import static from django.contrib.auth import views as auth_views from django.urls import include from django.urls import path @@ -54,6 +55,8 @@ path("", RedirectView.as_view(url="project/")), ] +urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) + if settings.SCANCODEIO_ENABLE_ADMIN_SITE: urlpatterns.append(path("admin/", admin_site.urls)) diff --git a/scanpipe/forms.py b/scanpipe/forms.py index f854235aa..d903560f9 100644 --- a/scanpipe/forms.py +++ b/scanpipe/forms.py @@ -160,6 +160,7 @@ class Meta: "pipeline", "execute_now", "selected_groups", + "use_local_storage", ] def __init__(self, *args, **kwargs): @@ -173,6 +174,11 @@ def __init__(self, *args, **kwargs): pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False) self.fields["pipeline"].choices = pipeline_choices + self.fields["use_local_storage"].label = "Store packages locally" + self.fields["use_local_storage"].help_text = "If checked, " \ + "packages will be stored on the local filesystem." 
+ self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"}) + def clean_name(self): return " ".join(self.cleaned_data["name"].split()) diff --git a/scanpipe/migrations/0068_packagearchive_downloadedpackage.py b/scanpipe/migrations/0068_packagearchive_downloadedpackage.py new file mode 100644 index 000000000..66b62632d --- /dev/null +++ b/scanpipe/migrations/0068_packagearchive_downloadedpackage.py @@ -0,0 +1,44 @@ +# Generated by Django 5.1.1 on 2025-05-10 06:55 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scanpipe', '0067_discoveredpackage_notes'), + ] + + operations = [ + migrations.CreateModel( + name='PackageArchive', + fields=[ + ('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')), + ('checksum_sha256', models.CharField(db_index=True, help_text='SHA256 checksum of the package archive file.', max_length=64, unique=True)), + ('storage_path', models.CharField(help_text='Path to the stored archive file (e.g., file:///path/to/file).', max_length=1024)), + ('created_date', models.DateTimeField(auto_now_add=True, help_text='Date when the archive was added to storage.')), + ], + options={ + 'indexes': [models.Index(fields=['checksum_sha256'], name='checksum_idx')], + }, + ), + migrations.CreateModel( + name='DownloadedPackage', + fields=[ + ('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')), + ('url', models.URLField(blank=True, db_index=True, help_text='URL from which the package was downloaded, if applicable.', max_length=1024)), + ('filename', models.CharField(help_text='Name of the package file.', max_length=255)), + ('download_date', models.DateTimeField(auto_now_add=True, help_text='Date when the package was downloaded or added.')), + ('scan_log', 
models.TextField(blank=True, help_text='Log output from scanning the package.')), + ('scan_date', models.DateTimeField(blank=True, help_text='Date when the package was scanned.', null=True)), + ('project', models.ForeignKey(editable=False, on_delete=django.db.models.deletion.CASCADE, related_name='downloadedpackages', to='scanpipe.project')), + ('package_archive', models.ForeignKey(help_text='The stored archive file associated with this package.', on_delete=django.db.models.deletion.CASCADE, to='scanpipe.packagearchive')), + ], + options={ + 'indexes': [models.Index(fields=['url'], name='url_idx')], + 'constraints': [models.UniqueConstraint(condition=models.Q(('url__gt', '')), fields=('url', 'project'), name='scanpipe_downloadedpackage_unique_url_project')], + }, + ), + ] diff --git a/scanpipe/migrations/0069_packagearchive_package_file_and_more.py b/scanpipe/migrations/0069_packagearchive_package_file_and_more.py new file mode 100644 index 000000000..f11a518ae --- /dev/null +++ b/scanpipe/migrations/0069_packagearchive_package_file_and_more.py @@ -0,0 +1,23 @@ +# Generated by Django 5.1.1 on 2025-05-12 09:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scanpipe', '0068_packagearchive_downloadedpackage'), + ] + + operations = [ + migrations.AddField( + model_name='packagearchive', + name='package_file', + field=models.FileField(blank=True, help_text='The actual package archive file (e.g., ZIP or TAR).', null=True, upload_to='packages/'), + ), + migrations.AlterField( + model_name='packagearchive', + name='storage_path', + field=models.CharField(blank=True, help_text='Path to the stored archive file (e.g., file:///path/to/file).', max_length=1024), + ), + ] diff --git a/scanpipe/migrations/0070_project_use_local_storage_and_more.py b/scanpipe/migrations/0070_project_use_local_storage_and_more.py new file mode 100644 index 000000000..1f5db400d --- /dev/null +++ 
b/scanpipe/migrations/0070_project_use_local_storage_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 5.1.1 on 2025-05-26 09:19 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('scanpipe', '0069_packagearchive_package_file_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='project', + name='use_local_storage', + field=models.BooleanField(default=False, help_text='Store packages locally if enabled.'), + ), + migrations.AlterField( + model_name='packagearchive', + name='package_file', + field=models.FileField(blank=True, help_text='The actual package archive file ( ZIP or TAR).', null=True, upload_to='packages/'), + ), + migrations.AlterField( + model_name='packagearchive', + name='storage_path', + field=models.CharField(blank=True, help_text='Path to the stored archive file', max_length=1024), + ), + ] diff --git a/scanpipe/models.py b/scanpipe/models.py index f54395b05..f2efefa02 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -561,7 +561,8 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model): notes = models.TextField(blank=True) settings = models.JSONField(default=dict, blank=True) labels = TaggableManager(through=UUIDTaggedItem) - + use_local_storage = models.BooleanField(default=False, + help_text="Store packages locally if enabled.") objects = ProjectQuerySet.as_manager() class Meta: @@ -4134,6 +4135,102 @@ def success(self): return self.response_status_code in (200, 201, 202) +class PackageArchive(UUIDPKModel): + """ + Stores metadata about a package archive file stored in the project's storage. + Each archive is uniquely identified by its SHA256 checksum. 
+ """ + + checksum_sha256 = models.CharField( + max_length=64, + unique=True, + db_index=True, + help_text=_("SHA256 checksum of the package archive file."), + ) + storage_path = models.CharField( + max_length=1024, + blank=True, + help_text=_("Path to the stored archive file"), + ) + package_file = models.FileField( + upload_to="packages/", + null=True, + blank=True, + help_text=_("The actual package archive file ( ZIP or TAR)."), + ) + created_date = models.DateTimeField( + auto_now_add=True, + help_text=_("Date when the archive was added to storage."), + ) + + class Meta: + indexes = [ + models.Index(fields=["checksum_sha256"], name="checksum_idx"), + ] + + def __str__(self): + return f"Archive {self.checksum_sha256[:8]} at {self.storage_path + or self.package_file.name}" + + +class DownloadedPackage(UUIDPKModel): + """ + Tracks packages downloaded or provided as input for a project, linked to a + PackageArchive. Each instance represents a package associated with a project, + including its source URL (if downloaded) and scan details. 
+ """ + + project = models.ForeignKey( + Project, + related_name="downloadedpackages", + on_delete=models.CASCADE, + editable=False, + ) + url = models.URLField( + max_length=1024, + db_index=True, + blank=True, + help_text=_("URL from which the package was downloaded, if applicable."), + ) + filename = models.CharField( + max_length=255, + help_text=_("Name of the package file."), + ) + download_date = models.DateTimeField( + auto_now_add=True, + help_text=_("Date when the package was downloaded or added."), + ) + scan_log = models.TextField( + blank=True, + help_text=_("Log output from scanning the package."), + ) + scan_date = models.DateTimeField( + null=True, + blank=True, + help_text=_("Date when the package was scanned."), + ) + package_archive = models.ForeignKey( + PackageArchive, + on_delete=models.CASCADE, + help_text=_("The stored archive file associated with this package."), + ) + + class Meta: + indexes = [ + models.Index(fields=["url"], name="url_idx"), + ] + constraints = [ + models.UniqueConstraint( + fields=["url", "project"], + condition=Q(url__gt=""), + name="%(app_label)s_%(class)s_unique_url_project", + ), + ] + + def __str__(self): + return f"{self.filename} for project {self.project.name}" + + @receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL) def create_auth_token(sender, instance=None, created=False, **kwargs): """Create an API key token on user creation, using the signal system.""" diff --git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index 3b07795a4..5ec57e98e 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -20,6 +20,9 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + from aboutcode.pipeline import group from scanpipe import pipes from scanpipe.pipelines import Pipeline @@ -28,7 +31,9 @@ from scanpipe.pipes import matchcode from scanpipe.pipes import purldb from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive +logger = logging.getLogger(__name__) class DeployToDevelop(Pipeline): """ @@ -59,6 +64,7 @@ def steps(cls): cls.extract_inputs_to_codebase_directory, cls.extract_archives, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.fingerprint_codebase_directories, cls.flag_empty_files, cls.flag_whitespace_files, @@ -116,6 +122,40 @@ def steps(cls): ".odp", ] + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info( + f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage.") + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=self.purldb_package_extensions + ) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def get_inputs(self): """Locate the ``from`` and ``to`` input files.""" self.from_files, self.to_files = d2d.get_inputs(self.project) diff --git a/scanpipe/pipelines/docker.py b/scanpipe/pipelines/docker.py index 
5bf338458..aac1bf0f2 100644 --- a/scanpipe/pipelines/docker.py +++ b/scanpipe/pipelines/docker.py @@ -20,9 +20,15 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from scanpipe.pipelines.root_filesystem import RootFS from scanpipe.pipes import docker from scanpipe.pipes import rootfs +from scanpipe.pipes.fetch import store_package_archive + +logger = logging.getLogger(__name__) class Docker(RootFS): @@ -36,6 +42,7 @@ def steps(cls): cls.find_images_os_and_distro, cls.collect_images_information, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.collect_and_create_system_packages, cls.flag_uninteresting_codebase_resources, cls.flag_empty_files, @@ -74,6 +81,38 @@ def collect_and_create_codebase_resources(self): """Collect and labels all image files as CodebaseResources.""" for image in self.images: docker.create_codebase_resources(self.project, image) + self.package_files = [] + for resource in self.project.codebaseresources.filter(extension=".deb"): + self.package_files.append(resource.path) + logger.debug(f"Found package file: {resource.path}") + + def store_package_archives(self): + """Store identified package archives.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping package storage.") + return [] + + logger.info( + f"Storing package archives for project: {self.project.name}," + "files: {self.package_files}" + ) + stored_files = [] + for package_path in self.package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + return stored_files def collect_and_create_system_packages(self): """Collect installed system packages for each layer based on the distro.""" diff --git a/scanpipe/pipelines/docker_windows.py b/scanpipe/pipelines/docker_windows.py index 98684da13..e8f2cfe35 100644 --- a/scanpipe/pipelines/docker_windows.py +++ b/scanpipe/pipelines/docker_windows.py @@ -20,12 +20,16 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + from scanpipe.pipelines.docker import Docker from scanpipe.pipes import docker from scanpipe.pipes import rootfs from scanpipe.pipes import windows +from scanpipe.pipes.fetch import store_package_archive - +logger = logging.getLogger(__name__) class DockerWindows(Docker): """Analyze Windows Docker images.""" @@ -37,6 +41,7 @@ def steps(cls): cls.find_images_os_and_distro, cls.collect_images_information, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.collect_and_create_system_packages, cls.flag_known_software_packages, cls.flag_uninteresting_codebase_resources, @@ -50,6 +55,39 @@ def steps(cls): cls.flag_not_analyzed_codebase_resources, ) + def store_package_archives(self): + """Store identified package archives for Windows images.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." + "Skipping package storage.") + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=[".msi", ".exe"]) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def flag_known_software_packages(self): """Flag files from known software packages by checking common install paths.""" windows.flag_known_software(self.project) diff --git a/scanpipe/pipelines/inspect_packages.py 
b/scanpipe/pipelines/inspect_packages.py index 9d28c07cf..3c7e0db6b 100644 --- a/scanpipe/pipelines/inspect_packages.py +++ b/scanpipe/pipelines/inspect_packages.py @@ -20,10 +20,15 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from aboutcode.pipeline import group from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive +logger = logging.getLogger(__name__) class InspectPackages(ScanCodebase): """ @@ -50,6 +55,7 @@ def steps(cls): cls.flag_empty_files, cls.flag_ignored_resources, cls.scan_for_application_packages, + cls.store_package_archives, cls.resolve_dependencies, ) @@ -65,6 +71,41 @@ def scan_for_application_packages(self): progress_logger=self.log, ) + def store_package_archives(self): + """Store identified package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping package storage.") + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=[ + ".zip", ".whl", ".tar.gz", ".deb", ".rpm", ".apk", ".nupkg", ".msi", + ".exe"] + ) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + @group("StaticResolver") def resolve_dependencies(self): """ diff --git a/scanpipe/pipelines/load_inventory.py b/scanpipe/pipelines/load_inventory.py index 5b6a3cff3..a4dd83bbe 100644 --- a/scanpipe/pipelines/load_inventory.py +++ b/scanpipe/pipelines/load_inventory.py @@ -21,10 +21,13 @@ # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
import json +import logging from scanpipe.pipelines import Pipeline from scanpipe.pipes import input +from scanpipe.pipes.fetch import store_package_archive +logger = logging.getLogger(__name__) class LoadInventory(Pipeline): """ @@ -42,6 +45,7 @@ class LoadInventory(Pipeline): def steps(cls): return ( cls.get_inputs, + cls.store_inventory_files, cls.build_inventory_from_scans, ) @@ -49,6 +53,33 @@ def get_inputs(self): """Locate all the supported input files from the project's input/ directory.""" self.input_paths = self.project.inputs(extensions=self.supported_extensions) + def store_inventory_files(self): + """Store input inventory files locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." + "Skipping file storage.") + return [] + + logger.info(f"Storing inventory files for project: {self.project.name}") + stored_files = [] + + for input_path in self.input_paths: + if not input_path.exists(): + logger.error(f"Invalid or missing file path: {input_path}") + continue + input_path_str = str(input_path) + logger.info(f"Storing inventory file: {input_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=input_path_str + ) + logger.info(f"Stored inventory file {input_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {input_path_str}: {e}") + + return stored_files + def build_inventory_from_scans(self): """ Process JSON scan results files to populate packages, dependencies, and diff --git a/scanpipe/pipelines/load_sbom.py b/scanpipe/pipelines/load_sbom.py index 955e54dd9..b8ee6153b 100644 --- a/scanpipe/pipelines/load_sbom.py +++ b/scanpipe/pipelines/load_sbom.py @@ -20,9 +20,14 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+import logging +from pathlib import Path + from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import resolve +from scanpipe.pipes.fetch import store_package_archive +logger = logging.getLogger(__name__) class LoadSBOM(ScanCodebase): """ @@ -44,6 +49,7 @@ def steps(cls): cls.flag_empty_files, cls.flag_ignored_resources, cls.get_sbom_inputs, + cls.store_sbom_files, cls.get_packages_from_sboms, cls.create_packages_from_sboms, cls.create_dependencies_from_sboms, @@ -53,6 +59,34 @@ def get_sbom_inputs(self): """Locate all the SBOMs among the codebase resources.""" self.manifest_resources = resolve.get_manifest_resources(self.project) + def store_sbom_files(self): + """Store SBOM files locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." + "Skipping file storage.") + return [] + + logger.info(f"Storing SBOM files for project: {self.project.name}") + stored_files = [] + + for resource in self.manifest_resources: + resource_path = resource.path + if not Path(resource_path).exists(): + logger.error(f"Invalid or missing file path: {resource_path}") + continue + resource_path_str = str(resource_path) + logger.info(f"Storing SBOM file: {resource_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=resource_path_str + ) + logger.info(f"Stored SBOM file {resource_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {resource_path_str}: {e}") + + return stored_files + def get_packages_from_sboms(self): """Get packages data from SBOMs.""" self.packages = resolve.get_packages( diff --git a/scanpipe/pipelines/resolve_dependencies.py b/scanpipe/pipelines/resolve_dependencies.py index 781d1d639..a1450440b 100644 --- a/scanpipe/pipelines/resolve_dependencies.py +++ b/scanpipe/pipelines/resolve_dependencies.py @@ -20,12 +20,16 @@ # ScanCode.io is a free software code scanning tool 
from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from aboutcode.pipeline import group from scanpipe.pipelines.scan_codebase import ScanCodebase from scanpipe.pipes import resolve from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive - +logger = logging.getLogger(__name__) class ResolveDependencies(ScanCodebase): """ Resolve dependencies from package manifests and lockfiles. @@ -47,7 +51,9 @@ def steps(cls): cls.collect_and_create_codebase_resources, cls.flag_ignored_resources, cls.get_manifest_inputs, + cls.store_manifest_files, cls.scan_for_application_packages, + cls.store_package_archives, cls.create_packages_and_dependencies, cls.get_packages_from_manifest, cls.create_resolved_packages, @@ -57,6 +63,35 @@ def get_manifest_inputs(self): """Locate package manifest files with a supported package resolver.""" self.manifest_resources = resolve.get_manifest_resources(self.project) + def store_manifest_files(self): + """Store manifest files locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping file storage.") + return [] + + logger.info(f"Storing manifest files for project: {self.project.name}") + stored_files = [] + + for resource in self.manifest_resources: + resource_path = resource.path + if not Path(resource_path).exists(): + logger.error(f"Invalid or missing file path: {resource_path}") + continue + resource_path_str = str(resource_path) + logger.info(f"Storing manifest file: {resource_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=resource_path_str + ) + logger.info(f"Stored manifest file {resource_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {resource_path_str}: {e}") + + return stored_files + + @group("StaticResolver") def scan_for_application_packages(self): """ @@ -70,6 +105,41 @@ def scan_for_application_packages(self): progress_logger=self.log, ) + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping package storage.") + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=[ + ".whl", ".tar.gz", ".zip", ".deb", ".rpm", ".apk", ".nupkg" + ] + ) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + @group("StaticResolver") def create_packages_and_dependencies(self): """ diff --git a/scanpipe/pipelines/root_filesystem.py b/scanpipe/pipelines/root_filesystem.py index 76478ce6d..d9c257245 100644 --- a/scanpipe/pipelines/root_filesystem.py +++ b/scanpipe/pipelines/root_filesystem.py @@ -20,12 +20,16 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+from pathlib import Path + from extractcode import EXTRACT_SUFFIX +from scanpipe.models import DiscoveredPackage from scanpipe.pipelines import Pipeline from scanpipe.pipes import flag from scanpipe.pipes import rootfs from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive class RootFS(Pipeline): @@ -39,6 +43,7 @@ def steps(cls): cls.collect_rootfs_information, cls.collect_and_create_codebase_resources, cls.collect_and_create_system_packages, + cls.store_package_archives, cls.flag_uninteresting_codebase_resources, cls.flag_empty_files, cls.flag_ignored_resources, @@ -87,6 +92,75 @@ def collect_and_create_system_packages(self): for rfs in self.root_filesystems: rootfs.scan_rootfs_for_system_packages(self.project, rfs) + def store_package_archives(self): + """ + Store package archives (.deb, .apk) found in the root filesystem or fetch + them for detected system packages if configured to do so. + """ + if not self.project.use_local_storage: + self.log(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping package storage.") + return [] + if not self.env.get("STORE_DOWNLOADED_PACKAGES", True): + self.log("Package storage skipped: STORE_DOWNLOADED_PACKAGES is disabled") + return + + self.log(f"Storing package archives for project: {self.project.name}") + stored_files = [] + + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=[".deb", ".apk"] + ) + ] + for package_path in package_files: + if not Path(package_path).exists(): + self.log(f"Package file not found: {package_path}", level="ERROR") + continue + result = store_package_archive( + self.project, url=None, file_path=str(package_path) + ) + self.log(f"Stored package archive: {package_path}, Result: {result}") + stored_files.append(result) + system_packages = DiscoveredPackage.objects.filter(project=self.project) + self.log(f"Found {system_packages.count()} system packages") + + for pkg in system_packages: + if "alpine" in pkg.purl: + pkg_name = pkg.name + pkg_version = pkg.version + apk_url = f"http://dl-cdn.alpinelinux.org/alpine/v3.18/main/x86_64/{pkg_name}-{pkg_version}.apk" + try: + import requests + + response = requests.get(apk_url, stream=True, timeout=10) + response.raise_for_status() + dest_path = ( + Path(self.project.work_directory) + / "tmp" + / f"{pkg_name}-{pkg_version}.apk" + ) + dest_path.parent.mkdir(exist_ok=True) + with open(dest_path, "wb") as f: + for chunk in response.iter_content(1024): + f.write(chunk) + result = store_package_archive( + self.project, url=apk_url, file_path=str(dest_path) + ) + self.log( + f"Stored system package archive: {pkg_name}, URL: {apk_url}," + "Result: {result}" + ) + stored_files.append(result) + except Exception as e: + self.log( + f"Failed to fetch/store {pkg_name} from {apk_url}: {e}", + level="WARNING", + ) + + return stored_files + def flag_uninteresting_codebase_resources(self): """Flag files—not worth tracking—that don’t belong to any system packages.""" 
rootfs.flag_uninteresting_codebase_resources(self.project) diff --git a/scanpipe/pipelines/scan_codebase.py b/scanpipe/pipelines/scan_codebase.py index d5bbe992c..4590559dc 100644 --- a/scanpipe/pipelines/scan_codebase.py +++ b/scanpipe/pipelines/scan_codebase.py @@ -20,11 +20,16 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +import logging +from pathlib import Path + from scanpipe import pipes from scanpipe.pipelines import Pipeline from scanpipe.pipes import scancode +from scanpipe.pipes.fetch import store_package_archive from scanpipe.pipes.input import copy_inputs +logger = logging.getLogger(__name__) class ScanCodebase(Pipeline): """ @@ -41,6 +46,7 @@ def steps(cls): cls.copy_inputs_to_codebase_directory, cls.extract_archives, cls.collect_and_create_codebase_resources, + cls.store_package_archives, cls.flag_empty_files, cls.flag_ignored_resources, cls.scan_for_application_packages, @@ -54,6 +60,41 @@ def copy_inputs_to_codebase_directory(self): """ copy_inputs(self.project.inputs("*"), self.project.codebase_path) + def store_package_archives(self): + """Store package archives locally if enabled.""" + if not self.project.use_local_storage: + logger.info(f"Local storage is disabled for project: {self.project.name}." 
+ "Skipping package storage.") + return [] + + logger.info(f"Storing package archives for project: {self.project.name}") + stored_files = [] + package_files = [ + resource.path + for resource in self.project.codebaseresources.filter( + extension__in=[ + ".whl", ".tar.gz", ".zip", ".deb", ".rpm", ".apk", ".nupkg", ".msi", + ".exe"] + ) + ] + + for package_path in package_files: + if not Path(package_path).exists(): + logger.error(f"Invalid or missing package path: {package_path}") + continue + package_path_str = str(package_path) + logger.info(f"Storing package archive: {package_path_str}") + try: + result = store_package_archive( + self.project, url=None, file_path=package_path_str + ) + logger.info(f"Stored package archive {package_path_str}: {result}") + stored_files.append(result) + except Exception as e: + logger.error(f"Failed to store {package_path_str}: {e}") + + return stored_files + def collect_and_create_codebase_resources(self): """Collect and create codebase resources.""" pipes.collect_and_create_codebase_resources(self.project) diff --git a/scanpipe/pipelines/scan_single_package.py b/scanpipe/pipelines/scan_single_package.py index 1c9eee2ee..1081fa996 100644 --- a/scanpipe/pipelines/scan_single_package.py +++ b/scanpipe/pipelines/scan_single_package.py @@ -21,6 +21,7 @@ # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
 import json
+import logging
 
 from django.core.serializers.json import DjangoJSONEncoder
 
@@ -29,9 +30,12 @@
 from scanpipe.pipelines import Pipeline
 from scanpipe.pipes import input
 from scanpipe.pipes import scancode
+from scanpipe.pipes.fetch import store_package_archive
 from scanpipe.pipes.input import copy_input
 from scanpipe.pipes.input import is_archive
 
+logger = logging.getLogger(__name__)
+
 
 class ScanSinglePackage(Pipeline):
     """
@@ -47,6 +51,7 @@ class ScanSinglePackage(Pipeline):
     def steps(cls):
         return (
             cls.get_package_input,
+            cls.store_package,
             cls.collect_input_information,
             cls.extract_input_to_codebase_directory,
             cls.extract_archives,
@@ -77,6 +82,34 @@ def get_package_input(self):
 
         self.input_path = inputs[0]
 
+    def store_package(self):
+        """
+        Store the package archive and create a DownloadedPackage entry if applicable.
+        Uses the input_path from get_package_input and the optional URL for
+        downloaded packages.
+        """
+        if not self.project.use_local_storage:
+            self.log(f"Local storage is disabled for project: {self.project.name}."
+                " Skipping package storage.")
+            return []
+        logger.info(
+            f"Starting store_package for project: {self.project},"
+            f" input: {self.input_path}"
+        )
+
+        file_path = self.input_path
+        if not file_path:
+            logger.error("No input file path available for storage")
+            return None
+
+        url = None
+
+        logger.info(f"Calling store_package_archive with URL: {url}, File: {file_path}")
+        result = store_package_archive(self.project, url=url, file_path=file_path)
+
+        logger.info(f"store_package completed, result: {result}")
+        return result
+
     def collect_input_information(self):
         """Collect and store information about the project input."""
         self.project.update_extra_data(
diff --git a/scanpipe/pipes/fetch.py b/scanpipe/pipes/fetch.py
index 06b4aca28..9d943ffb3 100644
--- a/scanpipe/pipes/fetch.py
+++ b/scanpipe/pipes/fetch.py
@@ -21,6 +21,7 @@
 # Visit https://github.com/aboutcode-org/scancode.io for support and download.
import cgi +import hashlib import json import logging import os @@ -33,6 +34,7 @@ from urllib.parse import urlparse from django.conf import settings +from django.core.files import File import git import requests @@ -42,6 +44,9 @@ from plugincode.location_provider import get_location from requests import auth as request_auth +from scanpipe.models import DownloadedPackage +from scanpipe.models import PackageArchive + logger = logging.getLogger("scanpipe.pipes") Download = namedtuple("Download", "uri directory filename path size sha1 md5") @@ -345,6 +350,80 @@ def fetch_git_repo(url, to=None): ) +def store_package_archive(project, url=None, file_path=None): + """ + Store a package in PackageArchive and link it to DownloadedPackage. + + Args: + project: The ScanCode.io Project instance. + url (str, optional): The URL from which the package was downloaded. + file_path (str or Path, optional): Path to the package file. + + Returns: + DownloadedPackage: The created DownloadedPackage instance, or + None if storage is disabled or an error occurs. 
+
+    """
+    logger.info(
+        f"store_package_archive called with project: {project}, url: {url},"
+        f" file_path: {file_path}"
+    )
+
+    if not getattr(settings, "ENABLE_PACKAGE_STORAGE", False):
+        logger.info("Package storage disabled (ENABLE_PACKAGE_STORAGE=False)")
+        return None
+
+    if not file_path:
+        input_files = project.input_files.all()
+        if not input_files:
+            logger.info("No input files found for project")
+            return None
+        file_path = input_files[0].path
+        logger.info(f"Using first input file: {file_path}")
+
+    file_path = str(file_path)
+    logger.info(f"Processing file: {file_path}")
+
+    try:
+        with open(file_path, "rb") as f:
+            checksum = hashlib.sha256(f.read()).hexdigest()
+        logger.info(f"Calculated SHA256: {checksum}")
+    except FileNotFoundError as e:
+        logger.error(f"File not found: {file_path}, error: {e}")
+        return None
+
+    try:
+        archive, created = PackageArchive.objects.get_or_create(
+            checksum_sha256=checksum,
+            defaults={
+                "storage_path": file_path,
+                "package_file": File(
+                    open(file_path, "rb"), name=os.path.basename(file_path)
+                ),
+            },
+        )
+        logger.info(
+            f"PackageArchive {'created' if created else 'retrieved'}:"
+            f" {archive.checksum_sha256}"
+        )
+    except Exception as e:
+        logger.error(f"Error creating PackageArchive: {e}")
+        return None
+
+    try:
+        dp = DownloadedPackage.objects.create(
+            project=project,
+            url=url or "",
+            filename=os.path.basename(file_path),
+            package_archive=archive,
+        )
+        logger.info(f"DownloadedPackage created: {dp.url}, {dp.filename}")
+        return dp
+    except Exception as e:
+        logger.error(f"Error creating DownloadedPackage: {e}")
+        return None
+
+
 SCHEME_TO_FETCHER_MAPPING = {
     "http": fetch_http,
     "https": fetch_http,
@@ -372,15 +451,30 @@ def get_fetcher(url):
         raise ValueError(error_msg)
 
 
-def fetch_url(url):
+def fetch_url(url, project=None):
     """Fetch provided `url` and returns the result as a `Download` object."""
     fetcher = get_fetcher(url)
     logger.info(f'Fetching "{url}" using {fetcher.__name__}')
     downloaded = fetcher(url)
+
if ( + project + and getattr(settings, "ENABLE_PACKAGE_STORAGE", False) + and project.use_local_storage + ): + logger.info( + f"Storing package for project: {project.name} with url={url}" + ) + store_package_archive(project, url, downloaded.path) + elif project and not project.use_local_storage: + logger.info( + f"Skipping package storage for project: {project.name} " + "(local storage disabled)" + ) return downloaded -def fetch_urls(urls): + +def fetch_urls(urls, project=None): """ Fetch provided `urls` list. The `urls` can also be provided as a string containing one URL per line. diff --git a/scanpipe/templates/scanpipe/project_form.html b/scanpipe/templates/scanpipe/project_form.html index fd8459bd9..55667d610 100644 --- a/scanpipe/templates/scanpipe/project_form.html +++ b/scanpipe/templates/scanpipe/project_form.html @@ -57,6 +57,11 @@

+