Skip to content

add support to store packages/archives locally #1685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions scancodeio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
import sys
import tempfile
from pathlib import Path
Expand Down Expand Up @@ -367,6 +368,15 @@
PROJECT_DIR("static"),
]

# Media files (uploaded package archives, etc.)
# MEDIA_URL is the URL prefix and MEDIA_ROOT the on-disk "media" directory
# located directly under ROOT_DIR.

MEDIA_URL = "/media/"
MEDIA_ROOT = os.path.join(str(ROOT_DIR), "media")

# Package storage settings
# Read from the ENABLE_PACKAGE_STORAGE environment variable; off by default.
# NOTE(review): no code visible in this change reads this setting (the Docker
# pipeline checks ``project.use_local_storage`` instead) -- confirm it is used.

ENABLE_PACKAGE_STORAGE = env.bool("ENABLE_PACKAGE_STORAGE", default=False)

# Third-party apps

CRISPY_TEMPLATE_PACK = "bootstrap3"
Expand Down
3 changes: 3 additions & 0 deletions scancodeio/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.conf import settings
from django.conf.urls.static import static
from django.contrib.auth import views as auth_views
from django.urls import include
from django.urls import path
Expand Down Expand Up @@ -54,6 +55,8 @@
path("", RedirectView.as_view(url="project/")),
]

# Expose the files stored under MEDIA_ROOT at the MEDIA_URL prefix.
# NOTE(review): Django documents ``static()`` as a development-only helper
# that adds no URL patterns when DEBUG is False -- confirm how media files
# are meant to be served in production.
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)


if settings.SCANCODEIO_ENABLE_ADMIN_SITE:
urlpatterns.append(path("admin/", admin_site.urls))
Expand Down
6 changes: 6 additions & 0 deletions scanpipe/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class Meta:
"pipeline",
"execute_now",
"selected_groups",
"use_local_storage",
]

def __init__(self, *args, **kwargs):
Expand All @@ -178,6 +179,11 @@ def __init__(self, *args, **kwargs):
pipeline_choices = scanpipe_app.get_pipeline_choices(include_addon=False)
self.fields["pipeline"].choices = pipeline_choices

self.fields["use_local_storage"].label = "Store packages locally"
self.fields["use_local_storage"].help_text = "If checked, " \
"packages will be stored on the local filesystem."
self.fields["use_local_storage"].widget.attrs.update({"class": "checkbox"})

def clean_name(self):
return " ".join(self.cleaned_data["name"].split())

Expand Down
44 changes: 44 additions & 0 deletions scanpipe/migrations/0068_packagearchive_downloadedpackage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 5.1.1 on 2025-05-10 06:55

import django.db.models.deletion
import uuid
from django.db import migrations, models


class Migration(migrations.Migration):
    # Creates the PackageArchive and DownloadedPackage models.

    dependencies = [
        ('scanpipe', '0067_discoveredpackage_notes'),
    ]

    operations = [
        # PackageArchive: one row per stored archive, keyed by its SHA256.
        # NOTE(review): checksum_sha256 is declared unique=True and
        # db_index=True and also gets the explicit "checksum_idx" index --
        # confirm the extra indexes are intentional.
        migrations.CreateModel(
            name='PackageArchive',
            fields=[
                ('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')),
                ('checksum_sha256', models.CharField(db_index=True, help_text='SHA256 checksum of the package archive file.', max_length=64, unique=True)),
                ('storage_path', models.CharField(help_text='Path to the stored archive file (e.g., file:///path/to/file).', max_length=1024)),
                ('created_date', models.DateTimeField(auto_now_add=True, help_text='Date when the archive was added to storage.')),
            ],
            options={
                'indexes': [models.Index(fields=['checksum_sha256'], name='checksum_idx')],
            },
        ),
        # DownloadedPackage: links a Project to a PackageArchive; rows are
        # deleted together with either parent (CASCADE on both foreign keys).
        # NOTE(review): url has db_index=True and the explicit "url_idx"
        # index -- confirm both are needed.
        migrations.CreateModel(
            name='DownloadedPackage',
            fields=[
                ('uuid', models.UUIDField(db_index=True, default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UUID')),
                ('url', models.URLField(blank=True, db_index=True, help_text='URL from which the package was downloaded, if applicable.', max_length=1024)),
                ('filename', models.CharField(help_text='Name of the package file.', max_length=255)),
                ('download_date', models.DateTimeField(auto_now_add=True, help_text='Date when the package was downloaded or added.')),
                ('scan_log', models.TextField(blank=True, help_text='Log output from scanning the package.')),
                ('scan_date', models.DateTimeField(blank=True, help_text='Date when the package was scanned.', null=True)),
                ('project', models.ForeignKey(editable=False, on_delete=django.db.models.deletion.CASCADE, related_name='downloadedpackages', to='scanpipe.project')),
                ('package_archive', models.ForeignKey(help_text='The stored archive file associated with this package.', on_delete=django.db.models.deletion.CASCADE, to='scanpipe.packagearchive')),
            ],
            options={
                'indexes': [models.Index(fields=['url'], name='url_idx')],
                # The (url, project) pair must be unique, but only for rows
                # whose url is non-empty (condition url > '').
                'constraints': [models.UniqueConstraint(condition=models.Q(('url__gt', '')), fields=('url', 'project'), name='scanpipe_downloadedpackage_unique_url_project')],
            },
        ),
    ]
23 changes: 23 additions & 0 deletions scanpipe/migrations/0069_packagearchive_package_file_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 5.1.1 on 2025-05-12 09:41

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds PackageArchive.package_file and makes storage_path optional.

    dependencies = [
        ('scanpipe', '0068_packagearchive_downloadedpackage'),
    ]

    operations = [
        # New nullable FileField; files are uploaded under "packages/".
        migrations.AddField(
            model_name='packagearchive',
            name='package_file',
            field=models.FileField(blank=True, help_text='The actual package archive file (e.g., ZIP or TAR).', null=True, upload_to='packages/'),
        ),
        # storage_path now accepts an empty value (blank=True).
        migrations.AlterField(
            model_name='packagearchive',
            name='storage_path',
            field=models.CharField(blank=True, help_text='Path to the stored archive file (e.g., file:///path/to/file).', max_length=1024),
        ),
    ]
28 changes: 28 additions & 0 deletions scanpipe/migrations/0070_project_use_local_storage_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 5.1.1 on 2025-05-26 09:19

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds Project.use_local_storage and updates two PackageArchive help texts.

    dependencies = [
        ('scanpipe', '0069_packagearchive_package_file_and_more'),
    ]

    operations = [
        # Per-project opt-in flag; defaults to False.
        migrations.AddField(
            model_name='project',
            name='use_local_storage',
            field=models.BooleanField(default=False, help_text='Store packages locally if enabled.'),
        ),
        # The two AlterField operations below only change help_text compared
        # with the definitions in migration 0069.
        # NOTE(review): migrations 0068-0070 all belong to the same change --
        # consider squashing them into a single migration before merging.
        migrations.AlterField(
            model_name='packagearchive',
            name='package_file',
            field=models.FileField(blank=True, help_text='The actual package archive file ( ZIP or TAR).', null=True, upload_to='packages/'),
        ),
        migrations.AlterField(
            model_name='packagearchive',
            name='storage_path',
            field=models.CharField(blank=True, help_text='Path to the stored archive file', max_length=1024),
        ),
    ]
111 changes: 99 additions & 12 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,18 +585,9 @@ class Project(UUIDPKModel, ExtraDataFieldMixin, UpdateMixin, models.Model):
)
notes = models.TextField(blank=True)
settings = models.JSONField(default=dict, blank=True)
labels = TaggableManager(through=UUIDTaggedItem, ordering=["name"])
purl = models.CharField(
max_length=2048,
blank=True,
help_text=_(
"Package URL (PURL) for the project, required for pushing the project's "
"scan result to FederatedCode. For example, if the input is an input URL "
"like https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz, the "
"corresponding PURL would be pkg:npm/[email protected]."
),
)

labels = TaggableManager(through=UUIDTaggedItem)
use_local_storage = models.BooleanField(default=False,
help_text="Store packages locally if enabled.")
objects = ProjectQuerySet.as_manager()

class Meta:
Expand Down Expand Up @@ -4386,6 +4377,102 @@ def success(self):
return self.response_status_code in (200, 201, 202)


class PackageArchive(UUIDPKModel):
    """
    Store metadata about a package archive file kept in storage.

    Each archive is uniquely identified by its SHA256 checksum. The archive
    content is referenced either by ``storage_path`` or by the uploaded
    ``package_file``; both are optional at the model level.
    """

    # NOTE(review): ``unique=True`` and ``db_index=True`` are both set here and
    # ``Meta.indexes`` declares "checksum_idx" on the same column -- confirm
    # the additional indexes are intentional. Left unchanged because altering
    # them requires a new migration.
    checksum_sha256 = models.CharField(
        max_length=64,
        unique=True,
        db_index=True,
        help_text=_("SHA256 checksum of the package archive file."),
    )
    storage_path = models.CharField(
        max_length=1024,
        blank=True,
        help_text=_("Path to the stored archive file"),
    )
    package_file = models.FileField(
        upload_to="packages/",
        null=True,
        blank=True,
        help_text=_("The actual package archive file ( ZIP or TAR)."),
    )
    created_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the archive was added to storage."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["checksum_sha256"], name="checksum_idx"),
        ]

    def __str__(self):
        """Return the short checksum followed by where the archive is stored."""
        # Resolve the location before formatting: a replacement field that
        # spans several lines inside a single-quoted f-string is a SyntaxError
        # on Python versions older than 3.12.
        location = self.storage_path or self.package_file.name
        return f"Archive {self.checksum_sha256[:8]} at {location}"


class DownloadedPackage(UUIDPKModel):
    """
    Track a package downloaded or provided as input for a project, linked to a
    PackageArchive. Each instance represents a package associated with a
    project, including its source URL (if downloaded) and scan details.
    """

    # Owning project; rows are removed when the project is deleted.
    project = models.ForeignKey(
        Project,
        related_name="downloadedpackages",
        on_delete=models.CASCADE,
        editable=False,
    )
    # May be left empty (blank=True); see the conditional unique constraint
    # declared in Meta.
    # NOTE(review): ``db_index=True`` here and the "url_idx" entry in
    # ``Meta.indexes`` both target this column -- confirm both are needed.
    url = models.URLField(
        max_length=1024,
        db_index=True,
        blank=True,
        help_text=_("URL from which the package was downloaded, if applicable."),
    )
    filename = models.CharField(
        max_length=255,
        help_text=_("Name of the package file."),
    )
    # Set once, when the row is created (auto_now_add).
    download_date = models.DateTimeField(
        auto_now_add=True,
        help_text=_("Date when the package was downloaded or added."),
    )
    scan_log = models.TextField(
        blank=True,
        help_text=_("Log output from scanning the package."),
    )
    # Nullable: stays empty until a scan date is recorded.
    scan_date = models.DateTimeField(
        null=True,
        blank=True,
        help_text=_("Date when the package was scanned."),
    )
    # Deleting the archive also deletes every DownloadedPackage pointing to it.
    package_archive = models.ForeignKey(
        PackageArchive,
        on_delete=models.CASCADE,
        help_text=_("The stored archive file associated with this package."),
    )

    class Meta:
        indexes = [
            models.Index(fields=["url"], name="url_idx"),
        ]
        constraints = [
            # A given URL may appear only once per project; the condition
            # restricts the constraint to rows with a non-empty url.
            models.UniqueConstraint(
                fields=["url", "project"],
                condition=Q(url__gt=""),
                name="%(app_label)s_%(class)s_unique_url_project",
            ),
        ]

    def __str__(self):
        """Return the file name together with the owning project's name."""
        return f"{self.filename} for project {self.project.name}"


@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL)
def create_auth_token(sender, instance=None, created=False, **kwargs):
"""Create an API key token on user creation, using the signal system."""
Expand Down
41 changes: 40 additions & 1 deletion scanpipe/pipelines/analyze_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines.analyze_root_filesystem import RootFS
import logging
from pathlib import Path

from scanpipe.pipelines import RootFS
from scanpipe.pipes import docker
from scanpipe.pipes import rootfs
from scanpipe.pipes.fetch import store_package_archive

logger = logging.getLogger(__name__)


class Docker(RootFS):
Expand All @@ -36,6 +42,7 @@ def steps(cls):
cls.find_images_os_and_distro,
cls.collect_images_information,
cls.collect_and_create_codebase_resources,
cls.store_package_archives,
cls.collect_and_create_system_packages,
cls.flag_uninteresting_codebase_resources,
cls.flag_empty_files,
Expand Down Expand Up @@ -74,6 +81,38 @@ def collect_and_create_codebase_resources(self):
"""Collect and labels all image files as CodebaseResources."""
for image in self.images:
docker.create_codebase_resources(self.project, image)
self.package_files = []
for resource in self.project.codebaseresources.filter(extension=".deb"):
self.package_files.append(resource.path)
logger.debug(f"Found package file: {resource.path}")

def store_package_archives(self):
    """
    Store the package archives listed in ``self.package_files``.

    Return an empty list without storing anything when the project has
    ``use_local_storage`` disabled. Otherwise each path is passed to
    ``store_package_archive``; a path that does not exist, or whose storage
    attempt raises, is logged and skipped so that one bad archive does not
    abort the whole step.

    Return the list of values returned by ``store_package_archive`` for the
    paths that were stored successfully.
    """
    project = self.project
    if not project.use_local_storage:
        logger.info(
            f"Local storage is disabled for project: {project.name}. "
            "Skipping package storage."
        )
        return []

    # Both fragments need the ``f`` prefix, otherwise the file list is logged
    # as the literal text "{self.package_files}".
    logger.info(
        f"Storing package archives for project: {project.name}, "
        f"files: {self.package_files}"
    )

    stored_files = []
    for package_path in self.package_files:
        # NOTE(review): the entries are ``resource.path`` values collected by
        # ``collect_and_create_codebase_resources``. If those paths are
        # relative to the project codebase directory, this check is resolved
        # against the process working directory and every file would be
        # reported as missing -- confirm, and consider collecting absolute
        # locations instead.
        if not Path(package_path).exists():
            logger.error(f"Invalid or missing package path: {package_path}")
            continue

        package_path_str = str(package_path)
        logger.info(f"Storing package archive: {package_path_str}")
        try:
            result = store_package_archive(
                project, url=None, file_path=package_path_str
            )
        except Exception as error:
            # Deliberately best-effort: keep processing the remaining
            # archives, but record the traceback for troubleshooting.
            logger.exception(f"Failed to store {package_path_str}: {error}")
            continue

        logger.info(f"Stored package archive {package_path_str}: {result}")
        stored_files.append(result)

    return stored_files

def collect_and_create_system_packages(self):
"""Collect installed system packages for each layer based on the distro."""
Expand Down
Loading
Loading