Skip to content

Commit fc5c92d

Browse files
committed
Add d2d step to map javascript using string literals
Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent 8ccbef3 commit fc5c92d

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

scanpipe/filters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,7 @@ def filter(self, qs, value):
483483
("about_file", "about file"),
484484
("java_to_class", "java to class"),
485485
("jar_to_source", "jar to source"),
486+
("javascript_strings", "js strings"),
486487
("javascript_symbols", "js symbols"),
487488
("js_compiled", "js compiled"),
488489
("js_colocation", "js colocation"),

scanpipe/pipelines/deploy_to_develop.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def steps(cls):
7272
cls.map_jar_to_source,
7373
cls.map_javascript,
7474
cls.map_javascript_symbols,
75+
cls.map_javascript_strings,
7576
cls.map_elf,
7677
cls.map_go,
7778
cls.map_rust,
@@ -202,6 +203,11 @@ def map_javascript_symbols(self):
202203
"""Map deployed JavaScript, TypeScript to its sources using symbols."""
203204
d2d.map_javascript_symbols(project=self.project, logger=self.log)
204205

206+
@optional_step("JavaScript")
def map_javascript_strings(self):
    """
    Map deployed JavaScript and TypeScript files to their sources by comparing
    string literals, delegating to ``d2d.map_javascript_strings``.
    """
    d2d.map_javascript_strings(logger=self.log, project=self.project)
210+
205211
@optional_step("Elf")
206212
def map_elf(self):
207213
"""Map ELF binaries to their sources."""

scanpipe/pipes/d2d.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
from scanpipe.pipes import purldb
6060
from scanpipe.pipes import resolve
6161
from scanpipe.pipes import scancode
62+
from scanpipe.pipes import strings
6263
from scanpipe.pipes import symbolmap
6364
from scanpipe.pipes import symbols
6465

@@ -2055,3 +2056,84 @@ def _map_javascript_symbols(to_resource, javascript_from_resources, logger):
20552056
to_resource.update(status=flag.MAPPED)
20562057
return 1
20572058
return 0
2059+
2060+
2061+
def map_javascript_strings(project, logger=None):
    """
    Map deployed JavaScript, TypeScript to its sources using string literals.

    Candidate to/ resources are ``.ts``/``.js`` files with no relation yet and
    non-empty ``extra_data``; candidate from/ resources are ``.ts``/``.js``
    files with non-empty ``extra_data`` whose path does not contain ``/test/``.
    Nothing is done when either side has no candidates.
    """
    js_extensions = [".ts", ".js"]
    all_files = project.codebaseresources.files()

    deployed_js = (
        all_files.to_codebase()
        .has_no_relation()
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )
    source_js = (
        all_files.from_codebase()
        .exclude(path__contains="/test/")
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )

    # Both sides are required; bail out early when one of them is empty.
    if not source_js.exists() or not deployed_js.exists():
        return

    from_count = source_js.count()
    to_count = deployed_js.count()
    if logger:
        logger(
            f"Mapping {to_count:,d} JavaScript resources"
            f" using string literals against {from_count:,d}"
            " from/ resources."
        )

    progress = LoopProgress(to_count, logger)
    deployed_iterator = deployed_js.iterator(chunk_size=2000)

    # Each per-resource call returns 1 when a relation was created, else 0.
    resource_mapped = sum(
        _map_javascript_strings(to_resource, source_js, logger)
        for to_resource in progress.iter(deployed_iterator)
    )

    if logger:
        logger(f"{resource_mapped:,d} resource mapped using strings")
2101+
2102+
2103+
def _map_javascript_strings(to_resource, javascript_from_resources, logger):
    """
    Map a deployed JavaScript resource to its source using string literals and
    return 1 if match is found otherwise return 0.

    The ``source_strings`` list stored in ``to_resource.extra_data`` is compared
    with the ``source_strings`` of every resource in
    ``javascript_from_resources``. A resource with no ``source_strings``, or with
    no more than ``ignoreable_string_threshold`` of them, is skipped.

    When at least one candidate matches, a ``javascript_strings`` relation is
    created with the highest-scoring candidate, the score is stored under the
    ``js_string_map_score`` key of the relation extra data, and ``to_resource``
    is flagged as mapped.

    ``logger`` is accepted for parity with the caller and is not used here.
    """
    ignoreable_string_threshold = 5
    to_strings = to_resource.extra_data.get("source_strings")

    # ``or`` (not ``and``): skip when strings are missing/empty OR too few.
    # This also avoids calling len() on None.
    if not to_strings or len(to_strings) <= ignoreable_string_threshold:
        return 0

    best_matching_score = 0
    best_match = None
    for source_js in javascript_from_resources:
        from_strings = source_js.extra_data.get("source_strings")
        if not from_strings or len(from_strings) <= ignoreable_string_threshold:
            continue

        is_match, similarity = strings.match_source_strings_to_deployed(
            source_strings=from_strings,
            deployed_strings=to_strings,
        )

        if is_match and similarity > best_matching_score:
            best_matching_score = similarity
            best_match = source_js

    if best_match is None:
        return 0

    pipes.make_relation(
        from_resource=best_match,
        to_resource=to_resource,
        map_type="javascript_strings",
        # Record the score of the selected match, not the score of whichever
        # candidate happened to be compared last.
        extra_data={"js_string_map_score": best_matching_score},
    )
    to_resource.update(status=flag.MAPPED)
    return 1
return 0

scanpipe/pipes/strings.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,14 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
from collections import Counter
24+
2325
from aboutcode.pipeline import LoopProgress
2426

27+
STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
28+
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
29+
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5
30+
2531

2632
class XgettextNotFound(Exception):
2733
pass
@@ -66,3 +72,37 @@ def _collect_and_store_resource_strings(resource):
6672
result = strings_xgettext.collect_strings(resource.location)
6773
strings = [item["string"] for item in result if "string" in item]
6874
resource.update_extra_data({"source_strings": strings})
75+
76+
77+
def match_source_strings_to_deployed(source_strings, deployed_strings):
    """
    Compare the string literals of a source file with those of a deployed file
    and return an ``(is_match, common_strings_ratio)`` tuple.

    The ratio is the number of occurrences, counted in both lists, of the
    strings present in both lists, divided by the combined length of the two
    lists. It is 0 when the lists share no string.

    The pair is a match when the ratio is above
    ``STRING_MATCHING_RATIO_JAVASCRIPT``, or when ``source_strings`` has more
    than ``SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT`` entries and the ratio is
    above ``STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE``.
    """
    source_counts = Counter(source_strings)
    deployed_counts = Counter(deployed_strings)
    source_total = len(source_strings)
    combined_total = source_total + len(deployed_strings)

    # Occurrences on both sides of every string the two files share.
    shared_total = sum(
        source_counts[literal] + deployed_counts[literal]
        for literal in source_counts.keys() & deployed_counts.keys()
    )

    # A non-zero shared total implies a non-zero combined total.
    ratio = shared_total / combined_total if shared_total else 0

    matched = ratio > STRING_MATCHING_RATIO_JAVASCRIPT or (
        source_total > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
        and ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
    )
    return matched, ratio

0 commit comments

Comments
 (0)