diff --git a/scanpipe/pipelines/__init__.py b/scanpipe/pipelines/__init__.py index b2a3f61cc5..4db02cfbaa 100644 --- a/scanpipe/pipelines/__init__.py +++ b/scanpipe/pipelines/__init__.py @@ -78,7 +78,10 @@ def flag_ignored_resources(self): ignored_patterns = ignored_patterns.splitlines() ignored_patterns.extend(flag.DEFAULT_IGNORED_PATTERNS) - flag.flag_ignored_patterns(self.project, patterns=ignored_patterns) + flag.flag_ignored_patterns( + codebaseresources=self.project.codebaseresources.no_status(), + patterns=ignored_patterns, + ) def extract_archive(self, location, target): """Extract archive at `location` to `target`. Save errors as messages.""" @@ -179,6 +182,8 @@ def __init__(self, run_instance): self.selected_groups = run_instance.selected_groups self.selected_steps = run_instance.selected_steps + self.ecosystem_config = None + @classmethod def get_initial_steps(cls): """Add the ``download_inputs`` step as an initial step if enabled.""" diff --git a/scanpipe/pipelines/deploy_to_develop.py b/scanpipe/pipelines/deploy_to_develop.py index 5213d347ad..7c9bc64565 100644 --- a/scanpipe/pipelines/deploy_to_develop.py +++ b/scanpipe/pipelines/deploy_to_develop.py @@ -24,6 +24,7 @@ from scanpipe import pipes from scanpipe.pipelines import Pipeline from scanpipe.pipes import d2d +from scanpipe.pipes import d2d_config from scanpipe.pipes import flag from scanpipe.pipes import input from scanpipe.pipes import matchcode @@ -64,6 +65,8 @@ def steps(cls): cls.flag_empty_files, cls.flag_whitespace_files, cls.flag_ignored_resources, + cls.load_ecosystem_config, + cls.map_ruby, cls.map_about_files, cls.map_checksum, cls.match_archives_to_purldb, @@ -94,33 +97,6 @@ def steps(cls): cls.create_local_files_packages, ) - purldb_package_extensions = [".jar", ".war", ".zip"] - purldb_resource_extensions = [ - ".map", - ".js", - ".mjs", - ".ts", - ".d.ts", - ".jsx", - ".tsx", - ".css", - ".scss", - ".less", - ".sass", - ".soy", - ".class", - ] - doc_extensions = [ - ".pdf", - 
".doc", - ".docx", - ".ppt", - ".pptx", - ".tex", - ".odt", - ".odp", - ] - def get_inputs(self): """Locate the ``from`` and ``to`` input files.""" self.from_files, self.to_files = d2d.get_inputs(self.project) @@ -155,6 +131,15 @@ def flag_whitespace_files(self): """Flag whitespace files with size less than or equal to 100 byte as ignored.""" d2d.flag_whitespace_files(project=self.project) + def load_ecosystem_config(self): + """Load ecosystem specific configurations for d2d steps for selected options.""" + d2d_config.load_ecosystem_config(pipeline=self, options=self.selected_groups) + + @optional_step("Ruby") + def map_ruby(self): + """Load Ruby specific configurations for d2d steps.""" + pass + def map_about_files(self): """Map ``from/`` .ABOUT files to their related ``to/`` resources.""" d2d.map_about_files(project=self.project, logger=self.log) @@ -171,7 +156,7 @@ def match_archives_to_purldb(self): d2d.match_purldb_resources( project=self.project, - extensions=self.purldb_package_extensions, + extensions=self.matchable_package_extensions, matcher_func=d2d.match_purldb_package, logger=self.log, ) @@ -249,7 +234,7 @@ def match_resources_to_purldb(self): d2d.match_purldb_resources( project=self.project, - extensions=self.purldb_resource_extensions, + extensions=self.matchable_resource_extensions, matcher_func=d2d.match_purldb_resource, logger=self.log, ) @@ -287,6 +272,7 @@ def flag_mapped_resources_archives_and_ignored_directories(self): def perform_house_keeping_tasks(self): """ On deployed side + - Ignore specific files based on ecosystem based configurations. - PurlDB match files with ``no-java-source`` and empty status, if no match is found update status to ``requires-review``. - Update status for uninteresting files. 
def ignore_unmapped_resources_from_config(project, patterns_to_ignore, logger=None):
    """
    Flag the status-less ``to/`` resources of ``project`` whose path matches one
    of the ``patterns_to_ignore`` with the ``flag.IGNORED_FROM_CONFIG`` status.

    When a ``logger`` callable is provided, it is called once with a summary
    message containing the count returned by ``flag.flag_ignored_patterns``.
    """
    # Only deployed-side resources that have no status yet are candidates.
    unmapped_to_resources = project.codebaseresources.to_codebase().no_status()

    flagged_count = flag.flag_ignored_patterns(
        codebaseresources=unmapped_to_resources,
        patterns=patterns_to_ignore,
        status=flag.IGNORED_FROM_CONFIG,
    )

    if not logger:
        return

    message = (
        f"Ignoring {flagged_count:,d} to/ resources with "
        "ecosystem specific configurations."
    )
    logger(message)
d2d_config.get_ecosystem_config(ecosystem="JavaScript") javascript_to_resources = ( project_files.to_codebase() .has_no_relation() - .filter(extension__in=[".ts", ".js"]) + .filter(extension__in=js_config.source_symbol_extensions) ) javascript_from_resources = ( project_files.from_codebase() .exclude(path__contains="/test/") - .filter(extension__in=[".ts", ".js"]) + .filter(extension__in=js_config.source_symbol_extensions) ) if not (javascript_from_resources.exists() and javascript_to_resources.exists()): diff --git a/scanpipe/pipes/d2d_config.py b/scanpipe/pipes/d2d_config.py new file mode 100644 index 0000000000..2508928323 --- /dev/null +++ b/scanpipe/pipes/d2d_config.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
import copy
from dataclasses import dataclass
from dataclasses import field


@dataclass
class EcosystemConfig:
    """
    Base class for ecosystem-specific configurations to be defined
    for each ecosystem.
    """

    # Name of the pipeline option (selected group) this configuration is for.
    ecosystem_option: str = "Default"

    # Extensions of package archive files for this ecosystem that are
    # matchable against the purldb using matchcode.
    matchable_package_extensions: list = field(default_factory=list)

    # Extensions of files for this ecosystem that are matchable against
    # the purldb using matchcode.
    matchable_resource_extensions: list = field(default_factory=list)

    # Extensions of document files which do not require review.
    doc_extensions: list = field(default_factory=list)

    # Paths in the deployed binaries/archives (on the to/ side) which
    # do not need review even if they are not matched to the source side.
    deployed_resource_path_exclusions: list = field(default_factory=list)

    # Paths in the development/source archive (on the from/ side) which
    # should not be considered even if unmapped to the deployed side when
    # assessing what to review on the deployed side.
    devel_resource_path_exclusions: list = field(default_factory=list)

    # Symbols found in ecosystem-specific standard libraries which are
    # not so useful in mapping.
    standard_symbols_to_exclude: list = field(default_factory=list)

    # File extensions which should be looked at for source symbol extraction
    # when mapping using symbols for a specific selected option/ecosystem.
    source_symbol_extensions: list = field(default_factory=list)


# Ecosystem configurations, keyed by pipeline option name.
# These instances are shared module-level state: treat them as read-only.
ECOSYSTEM_CONFIGS = {
    "Default": EcosystemConfig(
        matchable_package_extensions=[".zip", ".tar.gz", ".tar.xz"],
        devel_resource_path_exclusions=["*/tests/*"],
        doc_extensions=[
            ".pdf",
            ".doc",
            ".docx",
            ".ppt",
            ".pptx",
            ".tex",
            ".odt",
            ".odp",
        ],
    ),
    "Java": EcosystemConfig(
        ecosystem_option="Java",
        matchable_package_extensions=[".jar", ".war"],
        matchable_resource_extensions=[".class"],
    ),
    "JavaScript": EcosystemConfig(
        ecosystem_option="JavaScript",
        matchable_resource_extensions=[
            ".map",
            ".js",
            ".mjs",
            ".ts",
            ".d.ts",
            ".jsx",
            ".tsx",
            ".css",
            ".scss",
            ".less",
            ".sass",
            ".soy",
        ],
        source_symbol_extensions=[".ts", ".js"],
    ),
    "Go": EcosystemConfig(
        ecosystem_option="Go",
        matchable_resource_extensions=[".go"],
    ),
    "Rust": EcosystemConfig(
        ecosystem_option="Rust",
        matchable_resource_extensions=[".rs"],
        source_symbol_extensions=[".rs"],
    ),
    "Ruby": EcosystemConfig(
        ecosystem_option="Ruby",
        matchable_package_extensions=[".gem"],
        matchable_resource_extensions=[".rb"],
        deployed_resource_path_exclusions=["*checksums.yaml.gz*", "*metadata.gz*"],
    ),
    "Elf": EcosystemConfig(
        ecosystem_option="Elf",
        source_symbol_extensions=[".c", ".cpp", ".h"],
    ),
    "MacOS": EcosystemConfig(
        ecosystem_option="MacOS",
        source_symbol_extensions=[".c", ".cpp", ".h", ".m", ".swift"],
    ),
    "Windows": EcosystemConfig(
        ecosystem_option="Windows",
        source_symbol_extensions=[".c", ".cpp", ".h", ".cs"],
    ),
}


def get_ecosystem_config(ecosystem):
    """
    Return the shared configuration registered for ``ecosystem``, falling
    back to the "Default" configuration when ``ecosystem`` is unknown.

    The returned object is the instance stored in ``ECOSYSTEM_CONFIGS``:
    callers must not mutate it.
    """
    return ECOSYSTEM_CONFIGS.get(ecosystem, ECOSYSTEM_CONFIGS["Default"])


def load_ecosystem_config(pipeline, options):
    """
    Set ``pipeline.ecosystem_config`` to the default configuration merged with
    the configuration of each ecosystem selected in ``options``.

    These configurations are used for:
    - which resource/package extensions to match to purldb
    - which source files to get source symbols from
    - which unmapped paths to ignore in deployed binaries

    Options without an entry in ``ECOSYSTEM_CONFIGS`` are skipped, and an empty
    or ``None`` ``options`` yields the default configuration only.
    """
    # Start from a private copy of the defaults which are common across
    # ecosystems: merging into the shared instance would leak the selected
    # ecosystems into ECOSYSTEM_CONFIGS and into every later pipeline run.
    pipeline.ecosystem_config = copy.deepcopy(ECOSYSTEM_CONFIGS["Default"])

    # NOTE(review): ``options`` is presumably the pipeline ``selected_groups``,
    # which may be unset; ``None`` is treated as "nothing selected".
    for selected_option in options or ():
        if selected_option not in ECOSYSTEM_CONFIGS:
            continue

        add_ecosystem_config(
            pipeline_ecosystem_config=pipeline.ecosystem_config,
            ecosystem_config=get_ecosystem_config(ecosystem=selected_option),
        )


def add_ecosystem_config(pipeline_ecosystem_config, ecosystem_config):
    """
    Merge the d2d pipeline settings of ``ecosystem_config`` into
    ``pipeline_ecosystem_config``, updating the latter in place.

    For each merged setting, the values of ``ecosystem_config`` are appended
    after the existing pipeline values, skipping values already present.
    A new list is always assigned, so the lists held by ``ecosystem_config``
    are never shared with, or modified through, the pipeline configuration.
    """
    d2d_pipeline_configs = (
        "matchable_package_extensions",
        "matchable_resource_extensions",
        "deployed_resource_path_exclusions",
    )

    for config_name in d2d_pipeline_configs:
        config_value = getattr(ecosystem_config, config_name)
        if not config_value:
            continue

        # ``list.extend`` returns None, so build the merged list explicitly
        # rather than assigning the result of an in-place extend.
        merged_values = list(getattr(pipeline_ecosystem_config, config_name) or [])
        for value in config_value:
            if value not in merged_values:
                merged_values.append(value)

        setattr(pipeline_ecosystem_config, config_name, merged_values)
def flag_ignored_patterns(codebaseresources, patterns, status=IGNORED_PATTERN):
    """
    Set ``status`` on the ``codebaseresources`` matching each of the ``patterns``
    and return the total number of updated rows.

    ``patterns`` is either an iterable of patterns or a single string holding
    one pattern per line. Patterns are applied one after the other, in order.
    """
    pattern_list = patterns.splitlines() if isinstance(patterns, str) else patterns

    # The generator is consumed lazily, so each update runs sequentially.
    return sum(
        codebaseresources.path_pattern(pattern).update(status=status)
        for pattern in pattern_list
    )
inputs_with_codebase_path_destination = [ + (self.from_files, self.project1.codebase_path / d2d.FROM), + (self.to_files, self.project1.codebase_path / d2d.TO), + ] + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + scancode.extract_archive(input_file_path, codebase_path) + + scancode.extract_archives( + self.project1.codebase_path, + recurse=True, + ) + pipes.collect_and_create_codebase_resources(self.project1) + buffer = io.StringIO() + d2d.map_checksum( + project=self.project1, checksum_field="sha1", logger=buffer.write + ) + ruby_config = d2d_config.get_ecosystem_config(ecosystem="Ruby") + d2d.ignore_unmapped_resources_from_config( + project=self.project1, + patterns_to_ignore=ruby_config.deployed_resource_path_exclusions, + logger=buffer.write, + ) + d2d.flag_undeployed_resources(project=self.project1) + self.assertEqual( + 39, + CodebaseRelation.objects.filter( + project=self.project1, map_type="sha1" + ).count(), + ) + self.assertEqual( + 0, + CodebaseResource.objects.filter( + project=self.project1, status="requires-review" + ).count(), + ) + @skipIf(sys.platform == "darwin", "Test is failing on macOS") def test_scanpipe_pipes_d2d_map_rust_symbols(self): input_dir = self.project1.input_path diff --git a/scanpipe/tests/pipes/test_flag.py b/scanpipe/tests/pipes/test_flag.py index 211f7c57ab..c971590dd7 100644 --- a/scanpipe/tests/pipes/test_flag.py +++ b/scanpipe/tests/pipes/test_flag.py @@ -71,7 +71,9 @@ def test_scanpipe_pipes_flag_flag_ignored_directories(self): def test_scanpipe_pipes_flag_flag_ignored_patterns(self): patterns = ["*.ext", "dir/*"] - updated = flag.flag_ignored_patterns(self.project1, patterns) + updated = flag.flag_ignored_patterns( + self.project1.codebaseresources.no_status(), patterns + ) self.assertEqual(3, updated) self.resource1.refresh_from_db() @@ -86,7 +88,8 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self): make_resource_file(self.project1, 
"path/deeper/policies.yml") make_resource_file(self.project1, "path/other-policies.yml") updated = flag.flag_ignored_patterns( - self.project1, flag.DEFAULT_IGNORED_PATTERNS + self.project1.codebaseresources.no_status(), + flag.DEFAULT_IGNORED_PATTERNS, ) self.assertEqual(3, updated) @@ -97,7 +100,9 @@ def test_scanpipe_pipes_flag_flag_ignored_patterns(self): project2, "a.cdx.json.zip-extract/__MACOSX/._a.cdx.json" ) make_resource_file(project2, "a.cdx.json.zip-extract/a.cdx.json") - updated = flag.flag_ignored_patterns(project2, flag.DEFAULT_IGNORED_PATTERNS) + updated = flag.flag_ignored_patterns( + project2.codebaseresources.no_status(), flag.DEFAULT_IGNORED_PATTERNS + ) self.assertEqual(2, updated) ignored_qs = project2.codebaseresources.status(flag.IGNORED_PATTERN) self.assertEqual(2, ignored_qs.count()) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index e3df763948..21cefc4168 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -424,8 +424,11 @@ def test_scanpipe_pipeline_class_flag_ignored_resources(self): with mock.patch("scanpipe.pipes.flag.flag_ignored_patterns") as mock_flag: mock_flag.return_value = None pipeline.flag_ignored_resources() + + mock_flag.assert_called_once() patterns_args = ["*.ext", *flag.DEFAULT_IGNORED_PATTERNS] - mock_flag.assert_called_with(project1, patterns=patterns_args) + self.assertEqual(mock_flag.mock_calls[0].kwargs["patterns"], patterns_args) + self.assertEqual(mock_flag.mock_calls[0].kwargs["codebaseresources"].count(), 0) def test_scanpipe_pipeline_class_extract_archive(self): project1 = make_project()