28
28
which makes every filesystem performance suffer.
29
29
30
30
In addition, when storing these files in Git repositories, we need to avoid creating any repository
31
- with too many files that would make using this repository impactical or exceed the limits of some
31
+ with too many files that would make using this repository impractical or exceed the limits of some
32
32
repository hosting services.
33
33
34
34
Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
46
46
"""
47
47
Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
48
48
identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
49
- three segments composed of four letters and dihits each separated by a dash.
49
+ three segments composed of four letters and digits each separated by a dash.
50
50
For example::
51
51
>>> import re
52
52
>>> vcid = build_vcid()
53
53
>>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
54
54
55
55
We were mistakenly not using enough bits. The symptom was that the last
56
- segment of the VCID was always strting with "aaa" This ensure we are now OK:
56
+ segment of the VCID was always starting with "aaa". This ensures we are now OK:
57
57
>>> vcids = [build_vcid() for _ in range(50)]
58
58
>>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
59
59
"""
60
60
uid = uuid4 ().bytes
61
- # we keep three segments of 4 base32-encodee bytes, 3*4=12
61
+ # we keep three segments of 4 base32-encoded bytes, 3*4=12
62
62
# which corresponds to 60 bits
63
- # becausee each base32 byte can store 5 bits (2**5 = 32)
63
+ # because each base32 byte can store 5 bits (2**5 = 32)
64
64
uid = base32_custom (uid )[:12 ].decode ("utf-8" ).lower ()
65
65
return f"{ prefix } -{ uid [:4 ]} -{ uid [4 :8 ]} -{ uid [8 :12 ]} "
66
66
@@ -72,7 +72,7 @@ def get_vcid_yml_file_path(vcid: str):
72
72
return Path (VULNERABILITY_REPO_NAME ) / vulnerability_yml_path (vcid )
73
73
74
74
75
- # This cuxstom 32 characters alphabet is designed to avoid visually easily confusable characters:
75
+ # This custom 32 characters alphabet is designed to avoid visually easily confusable characters:
76
76
# i and l
77
77
# 0 and o
78
78
_base32_alphabet = b"abcdefghjkmnpqrstuvwxyz123456789"
@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
117
117
Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
118
118
119
119
The approach is to distribute the files in many directories to avoid having too many files in
120
- any directory and be able to find the path to a vulneravility file given its VCID distributed on
120
+ any directory and be able to find the path to a vulnerability file given its VCID distributed on
121
121
the first two characters of the UUID section of a VCID.
122
122
123
123
The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -140,9 +140,12 @@ def get_package_base_dir(purl: Union[PackageURL, str]):
140
140
"""
141
141
Return the base path to a Package directory (ignoring version) for a purl
142
142
"""
143
+ if isinstance (purl , str ):
144
+ purl = PackageURL .from_string (purl )
145
+
143
146
path_elements = package_path_elements (purl )
144
147
phash , core_path , _pversion , _extra_path = path_elements
145
- return Path (f"{ PACKAGE_REPOS_NAME_PREFIX } -{ phash } " ) / core_path
148
+ return Path (f"{ PACKAGE_REPOS_NAME_PREFIX } -{ purl . type } - { phash } " ) / core_path
146
149
147
150
148
151
def get_package_purls_yml_file_path (purl : Union [PackageURL , str ]):
@@ -159,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
159
162
return get_package_base_dir (purl ) / VULNERABILITIES_FILENAME
160
163
161
164
165
+ # We use a 4-tier system for storing package metadata.
166
+ # The tiers are as follows:
167
+ # 1. Super Large Ecosystem (~5M packages): 2^10 = 1,024 git repositories
168
+ # 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
169
+ # 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
170
+ # 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
171
+ # See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
172
+ BIT_COUNT_BY_ECOSYSTEM = {
173
+ # Super Large Ecosystem
174
+ "github" : 10 ,
175
+ "npm" : 10 ,
176
+ # Large Ecosystem
177
+ "golang" : 7 ,
178
+ "maven" : 7 ,
179
+ "nuget" : 7 ,
180
+ "perl" : 7 ,
181
+ "php" : 7 ,
182
+ "pypi" : 7 ,
183
+ "ruby" : 7 ,
184
+ # Medium Ecosystem
185
+ "alpm" : 5 ,
186
+ "bitbucket" : 5 ,
187
+ "cocoapods" : 5 ,
188
+ "composer" : 5 ,
189
+ "deb" : 5 ,
190
+ "docker" : 5 ,
191
+ "gem" : 5 ,
192
+ "generic" : 5 ,
193
+ "huggingface" : 5 ,
194
+ "mlflow" : 5 ,
195
+ "pub" : 5 ,
196
+ "rpm" : 5 ,
197
+ # Small Ecosystem
198
+ "bitnami" : 0 ,
199
+ "cargo" : 0 ,
200
+ "conan" : 0 ,
201
+ "conda" : 0 ,
202
+ "cpan" : 0 ,
203
+ "cran" : 0 ,
204
+ "hackage" : 0 ,
205
+ "hex" : 0 ,
206
+ "luarocks" : 0 ,
207
+ "swift" : 0 ,
208
+ }
209
+
210
+
162
211
def package_path_elements (purl : Union [PackageURL , str ]):
163
212
"""
164
213
Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -196,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
196
245
sbom.spdx.2.2.json : a SPDX SBOM
197
246
.... other files
198
247
199
- <extra_path> : one sub directory for each quote-encoded <qualifiers#supath > if any
248
+ <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath > if any
200
249
metadata.yml : ABOUT YAML file with package origin and license metadata for this version
201
250
scancode-scan.yml : a scancode scan for this package version
202
251
foo-scan.yml : a scan for this package version created with tool foo
@@ -208,15 +257,15 @@ def package_path_elements(purl: Union[PackageURL, str]):
208
257
We keep the same prefix for different versions::
209
258
210
259
>>> package_path_elements("pkg:pypi/[email protected] ")
211
- ('1050 ', 'pypi/license-expression', '30.3.1', '')
260
+ ('50 ', 'pypi/license-expression', '30.3.1', '')
212
261
>>> package_path_elements("pkg:pypi/[email protected] ")
213
- ('1050 ', 'pypi/license-expression', '10.3.1', '')
262
+ ('50 ', 'pypi/license-expression', '10.3.1', '')
214
263
215
264
We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
216
265
for filesystems::
217
266
218
267
>>> package_path_elements("pkg:pypi/[email protected] ?foo=bar&baz=bar#sub/path")
219
- ('1050 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
268
+ ('50 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
220
269
221
270
>>> purl = PackageURL(
222
271
... type="pypi",
@@ -225,12 +274,13 @@ def package_path_elements(purl: Union[PackageURL, str]):
225
274
... qualifiers=dict(foo="bar"),
226
275
... subpath="a/b/c")
227
276
>>> package_path_elements(purl)
228
- ('1050 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
277
+ ('50 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
229
278
"""
230
279
if isinstance (purl , str ):
231
280
purl = PackageURL .from_string (purl )
232
281
233
- purl_hash = get_purl_hash (purl )
282
+ bit_count = BIT_COUNT_BY_ECOSYSTEM .get (purl .type , 0 )
283
+ purl_hash = get_purl_hash (purl = purl , _bit_count = bit_count )
234
284
235
285
if ns := purl .namespace :
236
286
ns_name = f"{ ns } /{ purl .name } "
@@ -287,17 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
287
337
return PackageURL (** purld )
288
338
289
339
290
- def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 13 ) -> str :
340
+ def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 0 ) -> str :
291
341
"""
292
342
Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
293
343
and we drop its version, qualifiers and subpath.
294
344
295
- This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
296
- which represents 2**13 = 8192 possible hash values . It returns a fixed length short hash string
345
+ This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
346
+ which represents 2**0 = 1 possible hash value . It returns a fixed length short hash string
297
347
that is left-padded with zeros.
298
348
299
349
The hash length is derived from the bit_count and the number of bits-per-byte stored in an hex
300
- encoding of this bits count. For 13 bits, this means up to 4 characters.
350
+ encoding of this bits count. For 10 bits, this means up to 3 characters.
301
351
302
352
The function is carefully designed to be portable across tech stacks and easy to implement in
303
353
many programming languages:
@@ -319,23 +369,23 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
319
369
For example::
320
370
321
371
The hash does not change with version or qualifiers::
322
- >>> get_purl_hash("pkg:pypi/[email protected] ")
323
- '1289 '
324
- >>> get_purl_hash("pkg:pypi/[email protected] ")
325
- '1289 '
326
- >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path")
327
- '1289 '
372
+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 )
373
+ '09 '
374
+ >>> get_purl_hash("pkg:pypi/[email protected] ", 7 )
375
+ '09 '
376
+ >>> get_purl_hash("pkg:pypi/[email protected] ?foo=bar#sub/path", 7 )
377
+ '09 '
328
378
329
379
The hash is left-padded with zeros if it is shorter than the fixed hash length::
330
- >>> get_purl_hash("pkg:pypi/expressionss")
331
- '0057 '
380
+ >>> get_purl_hash("pkg:pypi/expressionss", 7 )
381
+ '57 '
332
382
333
383
We normalize the PURL. Here pypi normalization always uses dash for underscore ::
334
384
335
- >>> get_purl_hash("pkg:pypi/license_expression")
336
- '1050 '
337
- >>> get_purl_hash("pkg:pypi/license-expression")
338
- '1050 '
385
+ >>> get_purl_hash("pkg:pypi/license_expression", 7 )
386
+ '50 '
387
+ >>> get_purl_hash("pkg:pypi/license-expression", 7 )
388
+ '50 '
339
389
340
390
Originally from:
341
391
https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
0 commit comments