Skip to content

Commit f557411

Browse files
authored
Refine ScanCode.io d2d pipeline for JavaScript using string literals mapping (#1652)
* Add d2d step to map javascript using string literals Signed-off-by: Keshav Priyadarshi <[email protected]> * Add test for javascript string match Signed-off-by: Keshav Priyadarshi <[email protected]> --------- Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent c93fc98 commit f557411

File tree

9 files changed

+2164
-0
lines changed

9 files changed

+2164
-0
lines changed

scanpipe/filters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,7 @@ def filter(self, qs, value):
483483
("about_file", "about file"),
484484
("java_to_class", "java to class"),
485485
("jar_to_source", "jar to source"),
486+
("javascript_strings", "js strings"),
486487
("javascript_symbols", "js symbols"),
487488
("js_compiled", "js compiled"),
488489
("js_colocation", "js colocation"),

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def steps(cls):
7575
cls.map_jar_to_source,
7676
cls.map_javascript,
7777
cls.map_javascript_symbols,
78+
cls.map_javascript_strings,
7879
cls.map_elf,
7980
cls.map_macho,
8081
cls.map_winpe,
@@ -189,6 +190,11 @@ def map_javascript_symbols(self):
189190
"""Map deployed JavaScript, TypeScript to its sources using symbols."""
190191
d2d.map_javascript_symbols(project=self.project, logger=self.log)
191192

193+
@optional_step("JavaScript")
def map_javascript_strings(self):
    """
    Map deployed JavaScript, TypeScript to its sources using string
    literals.
    """
    # Delegates to the d2d pipe, handing over this pipeline's project and
    # its log callable.
    d2d.map_javascript_strings(logger=self.log, project=self.project)
197+
192198
@optional_step("Elf")
193199
def map_elf(self):
194200
"""Map ELF binaries to their sources using dwarf paths and symbols."""

scanpipe/pipes/d2d.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
from scanpipe.pipes import purldb
6464
from scanpipe.pipes import resolve
6565
from scanpipe.pipes import scancode
66+
from scanpipe.pipes import stringmap
6667
from scanpipe.pipes import symbolmap
6768
from scanpipe.pipes import symbols
6869

@@ -2170,3 +2171,86 @@ def _map_javascript_symbols(to_resource, javascript_from_resources, logger):
21702171
to_resource.update(status=flag.MAPPED)
21712172
return 1
21722173
return 0
2174+
2175+
2176+
def map_javascript_strings(project, logger=None):
    """
    Map deployed JavaScript, TypeScript to its sources using string literals.

    Candidates on both sides are ``.ts``/``.js`` files with non-empty
    ``extra_data``. Deployed (to/) candidates must not have a relation yet and
    source (from/) candidates under a ``/test/`` path are left out. Nothing is
    done when either side has no candidate.
    """
    js_extensions = [".ts", ".js"]
    files = project.codebaseresources.files()

    to_candidates = (
        files.to_codebase()
        .has_no_relation()
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )
    from_candidates = (
        files.from_codebase()
        .exclude(path__contains="/test/")
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )

    # Both sides are required; bail out as soon as one is found to be empty.
    if not from_candidates.exists() or not to_candidates.exists():
        return

    from_total = from_candidates.count()
    to_total = to_candidates.count()
    if logger:
        logger(
            f"Mapping {to_total:,d} JavaScript resources"
            f" using string literals against {from_total:,d}"
            " from/ resources."
        )

    to_iterator = to_candidates.iterator(chunk_size=2000)
    progress = LoopProgress(to_total, logger)

    # Each call returns 1 when the deployed resource got mapped, else 0.
    mapped_total = sum(
        _map_javascript_strings(to_resource, from_candidates, logger)
        for to_resource in progress.iter(to_iterator)
    )
    if logger:
        logger(f"{mapped_total:,d} resource mapped using strings")
2217+
2218+
def _map_javascript_strings(to_resource, javascript_from_resources, logger):
2219+
"""
2220+
Map a deployed JavaScript resource to its source using string literals and
2221+
return 1 if match is found otherwise return 0.
2222+
"""
2223+
ignoreable_string_threshold = 5
2224+
to_strings = to_resource.extra_data.get("source_strings")
2225+
to_strings_set = set(to_strings)
2226+
2227+
if not to_strings or len(to_strings_set) < ignoreable_string_threshold:
2228+
return 0
2229+
2230+
best_matching_score = 0
2231+
best_match = None
2232+
for source_js in javascript_from_resources:
2233+
from_strings = source_js.extra_data.get("source_strings")
2234+
from_strings_set = set(from_strings)
2235+
if not from_strings or len(from_strings_set) < ignoreable_string_threshold:
2236+
continue
2237+
2238+
is_match, similarity = stringmap.match_source_strings_to_deployed(
2239+
source_strings=from_strings,
2240+
deployed_strings=to_strings,
2241+
)
2242+
2243+
if is_match and similarity > best_matching_score:
2244+
best_matching_score = similarity
2245+
best_match = source_js
2246+
2247+
if best_match:
2248+
pipes.make_relation(
2249+
from_resource=best_match,
2250+
to_resource=to_resource,
2251+
map_type="javascript_strings",
2252+
extra_data={"js_string_map_score": similarity},
2253+
)
2254+
to_resource.update(status=flag.MAPPED)
2255+
return 1
2256+
return 0

scanpipe/pipes/stringmap.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from collections import Counter
24+
25+
# Ratio of shared string occurrences above which two files always match.
STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
# Number of source strings above which the relaxed ratio below is accepted.
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
# Relaxed ratio, only honored together with the string count threshold above.
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5


def match_source_strings_to_deployed(source_strings, deployed_strings):
    """
    Compute the similarity between source and deployed string literals and
    return whether they match based on matching threshold.

    Return an ``(is_match, common_strings_ratio)`` tuple. The ratio is the
    number of occurrences, counted on both sides, of the strings present in
    both inputs, divided by the total number of string occurrences of both
    inputs. Missing (``None``) or empty inputs never match and yield a ratio
    of 0.
    """
    # Callers read these lists from optional data: treat None as "no strings"
    # instead of failing with a TypeError.
    source_strings = source_strings or []
    deployed_strings = deployed_strings or []

    source_strings_counter = Counter(source_strings)
    deployed_strings_counter = Counter(deployed_strings)
    source_strings_count = len(source_strings)
    total_strings_count = source_strings_count + len(deployed_strings)

    # The Counter keys are the distinct strings of each side.
    common_strings = source_strings_counter.keys() & deployed_strings_counter.keys()
    total_common_strings_count = sum(
        source_strings_counter[string] + deployed_strings_counter[string]
        for string in common_strings
    )

    common_strings_ratio = 0
    # A non-zero common count implies a non-zero total, so the division is safe.
    if total_common_strings_count:
        common_strings_ratio = total_common_strings_count / total_strings_count

    # NOTE(review): the relaxed "small file" ratio is applied when the source
    # has MORE strings than the threshold; kept as-is, confirm this is intended.
    is_match = common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT or (
        source_strings_count > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
        and common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
    )
    return is_match, common_strings_ratio
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
about_resource: cesium
2+
name: cesium
3+
version: 1.125
4+
download_url: https://github.com/CesiumGS/cesium/archive/refs/tags/1.125.zip
5+
homepage_url: https://github.com/CesiumGS/cesium
6+
license_expression: apache-2.0
7+
attribute: yes
8+
package_url: pkg:github/CesiumGS/cesium@1.125

scanpipe/tests/data/d2d-javascript/strings/cesium/deployed-decodeI3S.js

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)