Skip to content

Commit 9dd1756

Browse files
committed
Add d2d step to map javascript using string literals
Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent a018711 commit 9dd1756

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

scanpipe/filters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,7 @@ def filter(self, qs, value):
483483
("about_file", "about file"),
484484
("java_to_class", "java to class"),
485485
("jar_to_source", "jar to source"),
486+
("javascript_strings", "js strings"),
486487
("javascript_symbols", "js symbols"),
487488
("js_compiled", "js compiled"),
488489
("js_colocation", "js colocation"),

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def steps(cls):
7272
cls.map_jar_to_source,
7373
cls.map_javascript,
7474
cls.map_javascript_symbols,
75+
cls.map_javascript_strings,
7576
cls.map_elf,
7677
cls.map_macho,
7778
cls.map_winpe,
@@ -204,6 +205,11 @@ def map_javascript_symbols(self):
204205
"""Map deployed JavaScript, TypeScript to its sources using symbols."""
205206
d2d.map_javascript_symbols(project=self.project, logger=self.log)
206207

208+
@optional_step("JavaScript")
def map_javascript_strings(self):
    """
    Map deployed JavaScript and TypeScript files to their sources by comparing
    string literals.
    """
    # Delegate to the d2d pipe, forwarding this pipeline's project and logger.
    project, log = self.project, self.log
    d2d.map_javascript_strings(project=project, logger=log)
212+
207213
@optional_step("Elf")
208214
def map_elf(self):
209215
"""Map ELF binaries to their sources using dwarf paths and symbols."""

scanpipe/pipes/d2d.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
from scanpipe.pipes import purldb
6363
from scanpipe.pipes import resolve
6464
from scanpipe.pipes import scancode
65+
from scanpipe.pipes import strings
6566
from scanpipe.pipes import symbolmap
6667
from scanpipe.pipes import symbols
6768

@@ -2146,3 +2147,84 @@ def _map_javascript_symbols(to_resource, javascript_from_resources, logger):
21462147
to_resource.update(status=flag.MAPPED)
21472148
return 1
21482149
return 0
2150+
2151+
2152+
def map_javascript_strings(project, logger=None):
    """
    Relate deployed JavaScript and TypeScript files of ``project`` to their source
    files by comparing string literals.

    Candidates on the to/ side are ``.ts``/``.js`` files without relation and with
    non-empty ``extra_data``. Candidates on the from/ side are ``.ts``/``.js`` files
    whose path does not contain ``/test/`` and with non-empty ``extra_data``.
    When provided, ``logger`` is called with progress and summary messages.
    """
    js_extensions = [".ts", ".js"]
    files = project.codebaseresources.files()

    to_candidates = files.to_codebase().has_no_relation()
    to_candidates = to_candidates.filter(extension__in=js_extensions)
    to_candidates = to_candidates.exclude(extra_data={})

    from_candidates = files.from_codebase().exclude(path__contains="/test/")
    from_candidates = from_candidates.filter(extension__in=js_extensions)
    from_candidates = from_candidates.exclude(extra_data={})

    # Nothing to map unless both sides have at least one candidate.
    if not from_candidates.exists() or not to_candidates.exists():
        return

    from_count = from_candidates.count()
    to_count = to_candidates.count()
    if logger:
        logger(
            f"Mapping {to_count:,d} JavaScript resources"
            f" using string literals against {from_count:,d}"
            " from/ resources."
        )

    to_iterator = to_candidates.iterator(chunk_size=2000)
    progress = LoopProgress(to_count, logger)

    # Each call returns 1 when the to/ resource got mapped and 0 otherwise.
    mapped_count = sum(
        _map_javascript_strings(to_resource, from_candidates, logger)
        for to_resource in progress.iter(to_iterator)
    )
    if logger:
        logger(f"{mapped_count:,d} resource mapped using strings")
2192+
2193+
2194+
def _map_javascript_strings(to_resource, javascript_from_resources, logger):
    """
    Map a deployed JavaScript resource to its source using string literals and
    return 1 if match is found otherwise return 0.

    The strings are read from the ``source_strings`` entry of each resource
    ``extra_data``. A resource whose entry is missing, empty, or holds fewer than
    ``ignoreable_string_threshold`` strings is skipped. Among the matching
    ``javascript_from_resources``, the one with the highest similarity is related
    to ``to_resource`` and that similarity is stored as ``js_string_map_score``.
    The ``logger`` argument is accepted but not used here.
    """
    ignoreable_string_threshold = 5
    to_strings = to_resource.extra_data.get("source_strings")

    # "or", not "and": a missing/empty value must short-circuit before len().
    if not to_strings or len(to_strings) < ignoreable_string_threshold:
        return 0

    best_matching_score = 0
    best_match = None
    for source_js in javascript_from_resources:
        from_strings = source_js.extra_data.get("source_strings")
        if not from_strings or len(from_strings) < ignoreable_string_threshold:
            continue

        is_match, similarity = strings.match_source_strings_to_deployed(
            source_strings=from_strings,
            deployed_strings=to_strings,
        )

        if is_match and similarity > best_matching_score:
            best_matching_score = similarity
            best_match = source_js

    if best_match:
        pipes.make_relation(
            from_resource=best_match,
            to_resource=to_resource,
            map_type="javascript_strings",
            # Record the winner's score, not the score of the last candidate.
            extra_data={"js_string_map_score": best_matching_score},
        )
        to_resource.update(status=flag.MAPPED)
        return 1
    return 0

scanpipe/pipes/strings.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,14 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
from collections import Counter
24+
2325
from aboutcode.pipeline import LoopProgress
2426

27+
STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
28+
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
29+
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5
30+
2531

2632
class XgettextNotFound(Exception):
2733
pass
@@ -66,3 +72,37 @@ def _collect_and_store_resource_strings(resource):
6672
result = strings_xgettext.collect_strings(resource.location)
6773
strings = [item["string"] for item in result if "string" in item]
6874
resource.update_extra_data({"source_strings": strings})
75+
76+
77+
def match_source_strings_to_deployed(source_strings, deployed_strings):
    """
    Compare the strings of a source file with the strings of a deployed file and
    return a ``(is_match, common_strings_ratio)`` tuple.

    The ratio is the number of occurrences, counted on both sides, of the strings
    present in both inputs, divided by the combined length of both inputs. It is
    0 when no string is shared.

    It is a match when the ratio is above ``STRING_MATCHING_RATIO_JAVASCRIPT``, or
    when ``source_strings`` has more than ``SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT``
    entries and the ratio is above ``STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE``.
    """
    source_counter = Counter(source_strings)
    deployed_counter = Counter(deployed_strings)
    source_total = len(source_strings)
    combined_total = source_total + len(deployed_strings)

    # Add up, for every shared string, its occurrences in both inputs.
    shared_occurrences = 0
    for literal in source_counter.keys() & deployed_counter.keys():
        shared_occurrences += source_counter[literal] + deployed_counter[literal]

    # A non-zero shared count implies a non-zero combined total.
    ratio = shared_occurrences / combined_total if shared_occurrences else 0

    if ratio > STRING_MATCHING_RATIO_JAVASCRIPT:
        return True, ratio

    is_match = (
        source_total > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
        and ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
    )
    return is_match, ratio

0 commit comments

Comments
 (0)