Skip to content

Commit 2f8e03e

Browse files
committed
Add test for javascript string match
Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent fc5c92d commit 2f8e03e

File tree

9 files changed

+2080
-45
lines changed

9 files changed

+2080
-45
lines changed

scanpipe/pipes/d2d.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
from scanpipe.pipes import purldb
6060
from scanpipe.pipes import resolve
6161
from scanpipe.pipes import scancode
62-
from scanpipe.pipes import strings
62+
from scanpipe.pipes import stringmap
6363
from scanpipe.pipes import symbolmap
6464
from scanpipe.pipes import symbols
6565

@@ -2107,18 +2107,20 @@ def _map_javascript_strings(to_resource, javascript_from_resources, logger):
21072107
"""
21082108
ignoreable_string_threshold = 5
21092109
to_strings = to_resource.extra_data.get("source_strings")
2110+
to_strings_set = set(to_strings)
21102111

2111-
if not to_strings and len(to_strings) > ignoreable_string_threshold:
2112+
if not to_strings or len(to_strings_set) < ignoreable_string_threshold:
21122113
return 0
21132114

21142115
best_matching_score = 0
21152116
best_match = None
21162117
for source_js in javascript_from_resources:
21172118
from_strings = source_js.extra_data.get("source_strings")
2118-
if not from_strings and len(from_strings) > ignoreable_string_threshold:
2119+
from_strings_set = set(from_strings)
2120+
if not from_strings or len(from_strings_set) < ignoreable_string_threshold:
21192121
continue
21202122

2121-
is_match, similarity = strings.match_source_strings_to_deployed(
2123+
is_match, similarity = stringmap.match_source_strings_to_deployed(
21222124
source_strings=from_strings,
21232125
deployed_strings=to_strings,
21242126
)

scanpipe/pipes/stringmap.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from collections import Counter
24+
25+
# Ratio above which source and deployed strings are always considered a match.
STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
# Source string count above which the relaxed ratio below is also accepted.
# NOTE(review): despite the "SMALL_FILE" naming, the relaxed ratio is applied
# when the source has MORE strings than this threshold -- confirm the intent.
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5


def match_source_strings_to_deployed(source_strings, deployed_strings):
    """
    Compute the similarity between source and deployed string literals and
    return whether they match based on matching threshold.

    ``source_strings`` and ``deployed_strings`` are iterables of hashable
    string literals. Duplicates are significant since every occurrence is
    counted. ``None`` is treated as an empty collection and one-shot iterables
    such as generators are accepted.

    The similarity ratio is the number of occurrences, summed over both sides,
    of the strings present on both sides, divided by the total number of
    occurrences on both sides.

    Return a ``(is_match, common_strings_ratio)`` tuple. ``is_match`` is True
    when the ratio is above ``STRING_MATCHING_RATIO_JAVASCRIPT``, or when there
    are more than ``SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT`` source strings and
    the ratio is above ``STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE``.
    ``common_strings_ratio`` is ``0`` when no string is shared.
    """
    # A single Counter per side yields both the occurrence counts and the set
    # of distinct strings while consuming each input exactly once.
    source_strings_counter = Counter(source_strings or ())
    deployed_strings_counter = Counter(deployed_strings or ())

    source_strings_count = sum(source_strings_counter.values())
    deployed_strings_count = sum(deployed_strings_counter.values())
    total_strings_count = source_strings_count + deployed_strings_count

    common_strings = source_strings_counter.keys() & deployed_strings_counter.keys()
    total_common_strings_count = sum(
        source_strings_counter[string] + deployed_strings_counter[string]
        for string in common_strings
    )

    # A non-zero common count implies a non-zero total, so the division is safe.
    common_strings_ratio = 0
    if total_common_strings_count:
        common_strings_ratio = total_common_strings_count / total_strings_count

    is_match = common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT or (
        source_strings_count > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
        and common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
    )

    return is_match, common_strings_ratio

scanpipe/pipes/strings.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,8 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
from collections import Counter
24-
2523
from aboutcode.pipeline import LoopProgress
2624

27-
STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
28-
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
29-
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5
30-
3125

3226
class XgettextNotFound(Exception):
    """
    Exception reserved for the case where xgettext cannot be found (per its
    name; the raise site is not shown here -- confirm the exact condition).
    """
@@ -72,37 +66,3 @@ def _collect_and_store_resource_strings(resource):
7266
result = strings_xgettext.collect_strings(resource.location)
7367
strings = [item["string"] for item in result if "string" in item]
7468
resource.update_extra_data({"source_strings": strings})
75-
76-
77-
def match_source_strings_to_deployed(source_strings, deployed_strings):
78-
common_strings_ratio = 0
79-
is_match = False
80-
deployed_strings_set = set(deployed_strings)
81-
source_strings_set = set(source_strings)
82-
source_strings_count = len(source_strings)
83-
deployed_strings_count = len(deployed_strings)
84-
total_strings_count = source_strings_count + deployed_strings_count
85-
source_strings_counter = Counter(source_strings)
86-
deployed_strings_counter = Counter(deployed_strings)
87-
88-
common_strings = source_strings_set.intersection(deployed_strings_set)
89-
total_common_strings_count = sum(
90-
[
91-
source_strings_counter.get(string, 0)
92-
+ deployed_strings_counter.get(string, 0)
93-
for string in common_strings
94-
]
95-
)
96-
97-
if total_common_strings_count:
98-
common_strings_ratio = total_common_strings_count / total_strings_count
99-
100-
if common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT:
101-
is_match = True
102-
elif (
103-
source_strings_count > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
104-
and common_strings_ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
105-
):
106-
is_match = True
107-
108-
return is_match, common_strings_ratio
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
about_resource: cesium
2+
name: cesium
3+
version: 1.125
4+
download_url: https://github.com/CesiumGS/cesium/archive/refs/tags/1.125.zip
5+
homepage_url: https://github.com/CesiumGS/cesium
6+
license_expression: apache-2.0
7+
attribute: yes
8+
package_url: pkg:github/CesiumGS/[email protected]

scanpipe/tests/data/d2d-javascript/strings/cesium/deployed-decodeI3S.js

Lines changed: 26 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)