aboutcode-org
diff --git a/‎aboutcode/hashid/__init__.py
Lines changed: 66 additions & 44 deletions b/‎aboutcode/hashid/__init__.py
Lines changed: 66 additions & 44 deletions
diff --git a/‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/purls.yml renamed to ‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0d/generic/nginx/test/purls.yml b/‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/purls.yml renamed to ‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0d/generic/nginx/test/purls.yml
diff --git a/‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/vulnerabilities.yml renamed to ‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0d/generic/nginx/test/vulnerabilities.yml b/‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/vulnerabilities.yml renamed to ‎vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0d/generic/nginx/test/vulnerabilities.yml
@@ -28,7 +28,7 @@
 which makes every filesystem performance suffer.
 
 In addition, when storing these files in Git repositories, we need to avoid creating any repository
-with too many files that would make using this repository impactical or exceed the limits of some
+with too many files that would make using this repository impractical or exceed the limits of some
 repository hosting services.
 
 Therefore we are storing vulnerability data using a directory tree using the first few characters
@@ -46,21 +46,21 @@ def build_vcid(prefix="VCID"):
     """
     Return a new Vulnerable Code ID (aka. VCID) which is a strongly unique vulnerability
     identifier string using the provided ``prefix``. A VCID is composed of a four letter prefix, and
-    three segments composed of four letters and dihits each separated by a dash.
+    three segments composed of four letters and digits each separated by a dash.
     For example::
     >>> import re
     >>> vcid = build_vcid()
     >>> assert re.match('VCID(-[a-hjkm-z1-9]{4}){3}', vcid), vcid
 
     We were mistakenly not using enough bits. The symptom was that the last
-    segment of the VCID was always strting with "aaa" This ensure we are now OK:
+    segment of the VCID was always string with "aaa" This ensure we are now OK:
     >>> vcids = [build_vcid() for _ in range(50)]
     >>> assert not any(vid.split("-")[-1].startswith("aaa") for vid in vcids)
     """
     uid = uuid4().bytes
-    # we keep  three segments of 4 base32-encodee bytes, 3*4=12
+    # we keep  three segments of 4 base32-encoded bytes, 3*4=12
     # which corresponds to 60 bits
-    # becausee each base32 byte can store 5 bits (2**5 = 32)
+    # because each base32 byte can store 5 bits (2**5 = 32)
     uid = base32_custom(uid)[:12].decode("utf-8").lower()
     return f"{prefix}-{uid[:4]}-{uid[4:8]}-{uid[8:12]}"
 
@@ -117,7 +117,7 @@ def vulnerability_yml_path(vcid):
     Return the path to a vulnerability YAML file crafted from the ``vcid`` VCID vulnerability id.
 
     The approach is to distribute the files in many directories to avoid having too many files in
-    any directory and be able to find the path to a vulneravility file given its VCID distributed on
+    any directory and be able to find the path to a vulnerability file given its VCID distributed on
     the first two characters of the UUID section of a VCID.
 
     The UUID is using a base32 encoding, hence keeping two characters means 32 x 32 = 1024
@@ -162,6 +162,52 @@ def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]):
     return get_package_base_dir(purl) / VULNERABILITIES_FILENAME
 
 
+# We use a 4-tier system for storing package metadata.
+# The tiers are as follows:
+# 1. Super Large Ecosystem (~5M packages): 2^10 = 1,028 git repositories
+# 2. Large Ecosystem (~500K packages): 2^7 = 128 git repositories
+# 3. Medium Ecosystem (~50K packages): 2^5 = 32 git repositories
+# 4. Small Ecosystem (~2K packages): 2^0 = 1 git repository
+# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
+BIT_COUNT_BY_ECOSYSTEM = {
+    # Super Large Ecosystem
+    "github": 10,
+    "npm": 10,
+    # Large Ecosystem
+    "golang": 7,
+    "maven": 7,
+    "nuget": 7,
+    "perl": 7,
+    "php": 7,
+    "pypi": 7,
+    "ruby": 7,
+    # Medium Ecosystem
+    "alpm": 5,
+    "bitbucket": 5,
+    "cocoapods": 5,
+    "composer": 5,
+    "deb": 5,
+    "docker": 5,
+    "generic": 5,
+    "huggingface": 5,
+    "mlflow": 5,
+    "pub": 5,
+    "rpm": 5,
+    # Small Ecosystem
+    "bitnami": 0,
+    "cargo": 0,
+    "conan": 0,
+    "conda": 0,
+    "cpan": 0,
+    "cran": 0,
+    "gem": 0,
+    "hackage": 0,
+    "hex": 0,
+    "luarocks": 0,
+    "swift": 0,
+}
+
+
 def package_path_elements(purl: Union[PackageURL, str]):
     """
     Return 4-tuple of POSIX path strings crafted from the ``purl`` package PURL string or object.
@@ -199,7 +245,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
           sbom.spdx.2.2.json : a SPDX SBOM
           .... other files
 
-          <extra_path> : one sub directory for each quote-encoded <qualifiers#supath> if any
+          <extra_path> : one sub directory for each quote-encoded <qualifiers#subpath> if any
             metadata.yml : ABOUT YAML file with package origin and license metadata for this version
             scancode-scan.yml : a scancode scan for this package version
             foo-scan.yml : a scan for this package version created with tool foo
@@ -233,7 +279,8 @@ def package_path_elements(purl: Union[PackageURL, str]):
     if isinstance(purl, str):
         purl = PackageURL.from_string(purl)
 
-    purl_hash = get_purl_hash(purl)
+    bit_count = BIT_COUNT_BY_ECOSYSTEM.get(purl.type, 0)
+    purl_hash = get_purl_hash(purl=purl, _bit_count=bit_count)
 
     if ns := purl.namespace:
         ns_name = f"{ns}/{purl.name}"
@@ -290,37 +337,17 @@ def get_core_purl(purl: Union[PackageURL, str]):
     return PackageURL(**purld)
 
 
-# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
-BIT_COUNT_BY_ECOSYSTEM = {
-    # Super large ecosystem 1024 repos.
-    "npm": 10,
-    # Large ecosystem 128 repos.
-    "pypi": 7,
-    "maven": 7,
-    "golang": 7,
-    "perl": 7,
-    "ruby": 7,
-    "nuget": 7,
-    "php": 7,
-    # Medium ecosystem 32 repos.
-    "rpm": 5,
-    "deb": 5,
-    # Small ecosystem 1 repo.
-    "github": 0,
-}
-
-
 def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
     """
     Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
     and we drop its version, qualifiers and subpath.
 
-    This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 13 bits
-    which represents 2**13 = 8192 possible hash values. It returns a fixed length short hash string
+    This function takes a normalized PURL string and a ``_bit_count`` argument defaulting to 0 bits
+    which represents 2**0 = 1 possible hash value. It returns a fixed length short hash string
     that is left-padded with zeros.
 
     The hash length is derived from the bit_count and the number of bits-per-byte stored in an hex
-    encoding of this bits count. For 13 bits, this means up to 4 characters.
+    encoding of this bits count. For 10 bits, this means up to 3 characters.
 
     The function is carefully designed to be portable across tech stacks and easy to implement in
     many programming languages:
@@ -342,36 +369,31 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
     For example::
 
     The hash does not change with version or qualifiers::
-    >>> get_purl_hash("pkg:pypi/[email protected]")
+    >>> get_purl_hash("pkg:pypi/[email protected]", 7)
     '09'
-    >>> get_purl_hash("pkg:pypi/[email protected]")
+    >>> get_purl_hash("pkg:pypi/[email protected]", 7)
     '09'
-    >>> get_purl_hash("pkg:pypi/[email protected]?foo=bar#sub/path")
+    >>> get_purl_hash("pkg:pypi/[email protected]?foo=bar#sub/path", 7)
     '09'
 
     The hash is left padded with zero if it::
-    >>> get_purl_hash("pkg:pypi/expressionss")
+    >>> get_purl_hash("pkg:pypi/expressionss", 7)
     '57'
 
     We normalize the PURL. Here pypi normalization always uses dash for underscore ::
 
-    >>> get_purl_hash("pkg:pypi/license_expression")
+    >>> get_purl_hash("pkg:pypi/license_expression", 7)
     '50'
-    >>> get_purl_hash("pkg:pypi/license-expression")
+    >>> get_purl_hash("pkg:pypi/license-expression", 7)
     '50'
 
     Originally from:
     https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
     """
 
-    core_purl = get_core_purl(purl)
-
-    if core_purl.type in BIT_COUNT_BY_ECOSYSTEM:
-        _bit_count = BIT_COUNT_BY_ECOSYSTEM[core_purl.type]
-
-    core_purl_str = core_purl.to_string()
+    core_purl = get_core_purl(purl).to_string()
     # compute the hash from a UTF-8 encoded string
-    purl_bytes = core_purl_str.encode("utf-8")
+    purl_bytes = core_purl.encode("utf-8")
     hash_bytes = sha256(purl_bytes).digest()
     # ... converted to integer so we can truncate with modulo. Note that we use big endian.
     hash_int = int.from_bytes(hash_bytes, "big")