Skip to content

Commit d139fc7

Browse files
committed
Add bit_count for more ecosystems
Signed-off-by: Keshav Priyadarshi <[email protected]>
1 parent b6a9db6 commit d139fc7

File tree

3 files changed

+66
-44
lines changed

3 files changed

+66
-44
lines changed

aboutcode/hashid/__init__.py

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
which makes every filesystem performance suffer.
2929
3030
In addition, when storing these files in Git repositories, we need to avoid creating any repository
31-
with too many files that would make using this repository impactical or exceed the limits of some
31+
with too many files that would make using this repository impractical or exceed the limits of some
3232
repository hosting services.
3333
3434
Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
4646
"""
4747
Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
4848
identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
49-
three segments composed of four letters and dihits each separated by a dash.
49+
three segments composed of four letters and digits each separated by a dash.
5050
For example::
5151
>>> import re
5252
>>> vcid = build_vcid()
5353
>>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
5454
5555
We were mistakenly not using enough bits. The symptom was that the last
56-
segment of the VCID was always strting with "aaa" This ensure we are now OK:
56+
segment of the VCID was always starting with "aaa". This ensures we are now OK:
5757
>>> vcids = [build_vcid() for _ in range(50)]
5858
>>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
5959
"""
6060
uid = uuid4().bytes
61-
# we keep three segments of 4 base32-encodee bytes, 3*4=12
61+
# we keep three segments of 4 base32-encoded bytes, 3*4=12
6262
# which corresponds to 60 bits
63-
# becausee each base32 byte can store 5 bits (2**5 = 32)
63+
# because each base32 byte can store 5 bits (2**5 = 32)
6464
uid = base32_custom(uid)[:12].decode("utf-8").lower()
6565
return f"{prefix}-{uid[:4]}-{uid[4:8]}-{uid[8:12]}"
6666

@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
117117
Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
118118
119119
The approach is to distribute the files in many directories to avoid having too many files in
120-
any directory and be able to find the path to a vulneravility file given its VCID distributed on
120+
any directory and be able to find the path to a vulnerability file given its VCID distributed on
121121
the first two characters of the UUID section of a VCID.
122122
123123
The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -162,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
162162
return get_package_base_dir(purl) / VULNERABILITIES_FILENAME
163163

164164

165+
# We use a 4-tier system for storing package metadata.
166+
# The tiers are as follows:
167+
# 1. Super Large Ecosystem (~5M packages): 2^10 = 1,024 git repositories
168+
# 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
169+
# 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
170+
# 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
171+
# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
172+
BIT_COUNT_BY_ECOSYSTEM = {
173+
# Super Large Ecosystem
174+
"github": 10,
175+
"npm": 10,
176+
# Large Ecosystem
177+
"golang": 7,
178+
"maven": 7,
179+
"nuget": 7,
180+
"perl": 7,
181+
"php": 7,
182+
"pypi": 7,
183+
"ruby": 7,
184+
# Medium Ecosystem
185+
"alpm": 5,
186+
"bitbucket": 5,
187+
"cocoapods": 5,
188+
"composer": 5,
189+
"deb": 5,
190+
"docker": 5,
191+
"generic": 5,
192+
"huggingface": 5,
193+
"mlflow": 5,
194+
"pub": 5,
195+
"rpm": 5,
196+
# Small Ecosystem
197+
"bitnami": 0,
198+
"cargo": 0,
199+
"conan": 0,
200+
"conda": 0,
201+
"cpan": 0,
202+
"cran": 0,
203+
"gem": 0,
204+
"hackage": 0,
205+
"hex": 0,
206+
"luarocks": 0,
207+
"swift": 0,
208+
}
209+
210+
165211
def package_path_elements(purl: Union[PackageURL, str]):
166212
"""
167213
Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -199,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
199245
sbom.spdx.2.2.json : a SPDX SBOM
200246
.... other files
201247
202-
<extra_path> : one sub directory for each quote-encoded <qualifiers#supath> if any
248+
<extra_path> : one sub directory for each quote-encoded <qualifiers#subpath> if any
203249
metadata.yml : ABOUT YAML file with package origin and license metadata for this version
204250
scancode-scan.yml : a scancode scan for this package version
205251
foo-scan.yml : a scan for this package version created with tool foo
@@ -233,7 +279,8 @@ def package_path_elements(purl: Union[PackageURL, str]):
233279
if isinstance(purl, str):
234280
purl = PackageURL.from_string(purl)
235281

236-
purl_hash = get_purl_hash(purl)
282+
bit_count = BIT_COUNT_BY_ECOSYSTEM.get(purl.type, 0)
283+
purl_hash = get_purl_hash(purl=purl, _bit_count=bit_count)
237284

238285
if ns := purl.namespace:
239286
ns_name = f"{ns}/{purl.name}"
@@ -290,37 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
290337
return PackageURL(**purld)
291338

292339

293-
# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
294-
BIT_COUNT_BY_ECOSYSTEM = {
295-
# Super large ecosystem 1024 repos.
296-
"npm": 10,
297-
# Large ecosystem 128 repos.
298-
"pypi": 7,
299-
"maven": 7,
300-
"golang": 7,
301-
"perl": 7,
302-
"ruby": 7,
303-
"nuget": 7,
304-
"php": 7,
305-
# Medium ecosystem 32 repos.
306-
"rpm": 5,
307-
"deb": 5,
308-
# Small ecosystem 1 repo.
309-
"github": 0,
310-
}
311-
312-
313340
def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
314341
"""
315342
Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
316343
and we drop its version, qualifiers and subpath.
317344
318-
This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
319-
which represents 2**13 = 8192 possible hash values. It returns a fixed length short hash string
345+
This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
346+
which represents 2**0 = 1 possible hash value. It returns a fixed length short hash string
320347
that is left-padded with zeros.
321348
322349
The hash length is derived from the bit_count and the number of bits-per-byte stored in an hex
323-
encoding of this bits count. For 13 bits, this means up to 4 characters.
350+
encoding of this bits count. For 10 bits, this means up to 3 characters.
324351
325352
The function is carefully designed to be portable across tech stacks and easy to implement in
326353
many programming languages:
@@ -342,36 +369,31 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
342369
For example::
343370
344371
The hash does not change with version or qualifiers::
345-
>>> get_purl_hash("pkg:pypi/[email protected]")
372+
>>> get_purl_hash("pkg:pypi/[email protected]", 7)
346373
'09'
347-
>>> get_purl_hash("pkg:pypi/[email protected]")
374+
>>> get_purl_hash("pkg:pypi/[email protected]", 7)
348375
'09'
349-
>>> get_purl_hash("pkg:pypi/[email protected]?foo=bar#sub/path")
376+
>>> get_purl_hash("pkg:pypi/[email protected]?foo=bar#sub/path", 7)
350377
'09'
351378
352379
The hash is left padded with zeros if it is shorter than the fixed hash length::
353-
>>> get_purl_hash("pkg:pypi/expressionss")
380+
>>> get_purl_hash("pkg:pypi/expressionss", 7)
354381
'57'
355382
356383
We normalize the PURL. Here pypi normalization always uses dash for underscore ::
357384
358-
>>> get_purl_hash("pkg:pypi/license_expression")
385+
>>> get_purl_hash("pkg:pypi/license_expression", 7)
359386
'50'
360-
>>> get_purl_hash("pkg:pypi/license-expression")
387+
>>> get_purl_hash("pkg:pypi/license-expression", 7)
361388
'50'
362389
363390
Originally from:
364391
https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
365392
"""
366393

367-
core_purl = get_core_purl(purl)
368-
369-
if core_purl.type in BIT_COUNT_BY_ECOSYSTEM:
370-
_bit_count = BIT_COUNT_BY_ECOSYSTEM[core_purl.type]
371-
372-
core_purl_str = core_purl.to_string()
394+
core_purl = get_core_purl(purl).to_string()
373395
# compute the hash from a UTF-8 encoded string
374-
purl_bytes = core_purl_str.encode("utf-8")
396+
purl_bytes = core_purl.encode("utf-8")
375397
hash_bytes = sha256(purl_bytes).digest()
376398
# ... converted to integer so we can truncate with modulo. Note that we use big endian.
377399
hash_int = int.from_bytes(hash_bytes, "big")

0 commit comments

Comments
 (0)