28
28
which makes every filesystem performance suffer.
29
29
30
30
In addition, when storing these files in Git repositories, we need to avoid creating any repository
31
- with too many files that would make using this repository impactical or exceed the limits of some
31
+ with too many files that would make using this repository impractical or exceed the limits of some
32
32
repository hosting services.
33
33
34
34
Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
46
46
"""
47
47
Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
48
48
identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
49
- three segments composed of four letters and dihits each separated by a dash.
49
+ three segments composed of four letters and digits each separated by a dash.
50
50
For example::
51
51
>>> import re
52
52
>>> vcid = build_vcid()
53
53
>>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
54
54
55
55
We were mistakenly not using enough bits. The symptom was that the last
56
- segment of the VCID was always strting with "aaa" This ensure we are now OK:
56
+ segment of the VCID was always starting with "aaa". This ensures we are now OK:
57
57
>>> vcids = [build_vcid() for _ in range(50)]
58
58
>>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
59
59
"""
60
60
uid = uuid4 ().bytes
61
- # we keep three segments of 4 base32-encodee bytes, 3*4=12
61
+ # we keep three segments of 4 base32-encoded bytes, 3*4=12
62
62
# which corresponds to 60 bits
63
- # becausee each base32 byte can store 5 bits (2**5 = 32)
63
+ # because each base32 byte can store 5 bits (2**5 = 32)
64
64
uid = base32_custom (uid )[:12 ].decode ("utf-8" ).lower ()
65
65
return f"{ prefix } -{ uid [:4 ]} -{ uid [4 :8 ]} -{ uid [8 :12 ]} "
66
66
@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
117
117
Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
118
118
119
119
The approach is to distribute the files in many directories to avoid having too many files in
120
- any directory and be able to find the path to a vulneravility file given its VCID distributed on
120
+ any directory and be able to find the path to a vulnerability file given its VCID distributed on
121
121
the first two characters of the UUID section of a VCID.
122
122
123
123
The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -162,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
162
162
return get_package_base_dir (purl ) / VULNERABILITIES_FILENAME
163
163
164
164
165
+ # We use a 4-tier system for storing package metadata.
166
+ # The tiers are as follows:
167
+ # 1. Super Large Ecosystem (~5M packages): 2^10 = 1,024 git repositories
168
+ # 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
169
+ # 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
170
+ # 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
171
+ # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
172
+ BIT_COUNT_BY_ECOSYSTEM = {
173
+ # Super Large Ecosystem
174
+ "github" : 10 ,
175
+ "npm" : 10 ,
176
+ # Large Ecosystem
177
+ "golang" : 7 ,
178
+ "maven" : 7 ,
179
+ "nuget" : 7 ,
180
+ "perl" : 7 ,
181
+ "php" : 7 ,
182
+ "pypi" : 7 ,
183
+ "ruby" : 7 ,
184
+ # Medium Ecosystem
185
+ "alpm" : 5 ,
186
+ "bitbucket" : 5 ,
187
+ "cocoapods" : 5 ,
188
+ "composer" : 5 ,
189
+ "deb" : 5 ,
190
+ "docker" : 5 ,
191
+ "generic" : 5 ,
192
+ "huggingface" : 5 ,
193
+ "mlflow" : 5 ,
194
+ "pub" : 5 ,
195
+ "rpm" : 5 ,
196
+ # Small Ecosystem
197
+ "bitnami" : 0 ,
198
+ "cargo" : 0 ,
199
+ "conan" : 0 ,
200
+ "conda" : 0 ,
201
+ "cpan" : 0 ,
202
+ "cran" : 0 ,
203
+ "gem" : 0 ,
204
+ "hackage" : 0 ,
205
+ "hex" : 0 ,
206
+ "luarocks" : 0 ,
207
+ "swift" : 0 ,
208
+ }
209
+
210
+
165
211
def package_path_elements (purl : Union [PackageURL , str ]):
166
212
"""
167
213
Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -199,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
199
245
sbom.spdx.2.2.json : a SPDX SBOM
200
246
.... other files
201
247
202
- <extra_path> : one sub directory for each quote-encoded <qualifiers#supath > if any
248
+ <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath > if any
203
249
metadata.yml : ABOUT YAML file with package origin and license metadata for this version
204
250
scancode-scan.yml : a scancode scan for this package version
205
251
foo-scan.yml : a scan for this package version created with tool foo
@@ -233,7 +279,8 @@ def package_path_elements(purl: Union[PackageURL, str]):
233
279
if isinstance (purl , str ):
234
280
purl = PackageURL .from_string (purl )
235
281
236
- purl_hash = get_purl_hash (purl )
282
+ bit_count = BIT_COUNT_BY_ECOSYSTEM .get (purl .type , 0 )
283
+ purl_hash = get_purl_hash (purl = purl , _bit_count = bit_count )
237
284
238
285
if ns := purl .namespace :
239
286
ns_name = f"{ ns } /{ purl .name } "
@@ -290,37 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
290
337
return PackageURL (** purld )
291
338
292
339
293
- # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
294
- BIT_COUNT_BY_ECOSYSTEM = {
295
- # Super large ecosystem 1024 repos.
296
- "npm" : 10 ,
297
- # Large ecosystem 128 repos.
298
- "pypi" : 7 ,
299
- "maven" : 7 ,
300
- "golang" : 7 ,
301
- "perl" : 7 ,
302
- "ruby" : 7 ,
303
- "nuget" : 7 ,
304
- "php" : 7 ,
305
- # Medium ecosystem 32 repos.
306
- "rpm" : 5 ,
307
- "deb" : 5 ,
308
- # Small ecosystem 1 repo.
309
- "github" : 0 ,
310
- }
311
-
312
-
313
340
def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 0 ) -> str :
314
341
"""
315
342
Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
316
343
and we drop its version, qualifiers and subpath.
317
344
318
- This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
319
- which represents 2**13 = 8192 possible hash values . It returns a fixed length short hash string
345
+ This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
346
+ which represents 2**0 = 1 possible hash value . It returns a fixed length short hash string
320
347
that is left-padded with zeros.
321
348
322
349
The hash length is derived from the bit_count and the number of bits-per-byte stored in an hex
323
- encoding of this bits count. For 13 bits, this means up to 4 characters.
350
+ encoding of this bits count. For 10 bits, this means up to 3 characters.
324
351
325
352
The function is carefully designed to be portable across tech stacks and easy to implement in
326
353
many programming languages:
@@ -342,36 +369,31 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
342
369
For example::
343
370
344
371
The hash does not change with version or qualifiers::
345
- >>> get_purl_hash("pkg:pypi/[email protected] ")
372
+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 )
346
373
'09'
347
- >>> get_purl_hash("pkg:pypi/[email protected] ")
374
+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 )
348
375
'09'
349
- >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path")
376
+ >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path", 7 )
350
377
'09'
351
378
352
379
The hash is left padded with zeros if it is shorter than the fixed hash length::
353
- >>> get_purl_hash("pkg:pypi/expressionss")
380
+ >>> get_purl_hash("pkg:pypi/expressionss", 7 )
354
381
'57'
355
382
356
383
We normalize the PURL. Here pypi normalization always uses dash for underscore ::
357
384
358
- >>> get_purl_hash("pkg:pypi/license_expression")
385
+ >>> get_purl_hash("pkg:pypi/license_expression", 7 )
359
386
'50'
360
- >>> get_purl_hash("pkg:pypi/license-expression")
387
+ >>> get_purl_hash("pkg:pypi/license-expression", 7 )
361
388
'50'
362
389
363
390
Originally from:
364
391
https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
365
392
"""
366
393
367
- core_purl = get_core_purl (purl )
368
-
369
- if core_purl .type in BIT_COUNT_BY_ECOSYSTEM :
370
- _bit_count = BIT_COUNT_BY_ECOSYSTEM [core_purl .type ]
371
-
372
- core_purl_str = core_purl .to_string ()
394
+ core_purl = get_core_purl (purl ).to_string ()
373
395
# compute the hash from a UTF-8 encoded string
374
- purl_bytes = core_purl_str .encode ("utf-8" )
396
+ purl_bytes = core_purl .encode ("utf-8" )
375
397
hash_bytes = sha256 (purl_bytes ).digest ()
376
398
# ... converted to integer so we can truncate with modulo. Note that we use big endian.
377
399
hash_int = int .from_bytes (hash_bytes , "big" )
0 commit comments