Skip to content

Commit 851933f

Browse files
committed
cleanup / fix listings handling
1 parent 2638a9c commit 851933f

File tree

3 files changed

+59
-10
lines changed

3 files changed

+59
-10
lines changed

src/datachain/lib/listing.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,24 @@ def _sanitize_ds_name(raw: str) -> str:
129129
return _DS_ENCODE_RE.sub(lambda m: f"_x{ord(m.group()):02x}", raw)
130130

131131

132+
# Pattern for the escapes undone by ``_desanitize_ds_name``: ``_x`` followed
# by exactly two lowercase hex digits (``_xHH``) or by a second ``_x``
# (``_x_x``).  Group 1 captures whatever follows the leading ``_x``.
_DS_DECODE_RE = re.compile(r"_x([0-9a-f]{2}|_x)")
133+
134+
135+
def _desanitize_ds_name(encoded: str) -> str:
    """Decode a string produced by :func:`_sanitize_ds_name`.

    Two escape forms are recognised, everything else is copied through
    untouched:

    * ``_xHH`` (two lowercase hex digits) becomes the character whose code
      point is ``0xHH``;
    * ``_x_x`` becomes a literal ``_x``.

    NOTE(review): exactly two hex digits are consumed per escape, so this
    assumes the encoder never emits more than two (i.e. only code points up
    to 0xff are escaped) -- confirm against ``_sanitize_ds_name``.
    """

    def _decode(match: re.Match) -> str:
        payload = match.group(1)
        # The pattern only lets two payload shapes through: "_x" (the escape
        # for a literal "_x") or a two-digit hex code point.
        return "_x" if payload == "_x" else chr(int(payload, 16))

    return _DS_DECODE_RE.sub(_decode, encoded)
148+
149+
132150
def parse_listing_uri(uri: str) -> tuple[str, str, str]:
133151
"""
134152
Parsing uri and returns listing dataset name, listing uri and listing path
@@ -156,10 +174,14 @@ def is_listing_dataset(name: str) -> bool:
156174

157175

158176
def listing_uri_from_name(dataset_name: str) -> str:
    """Recover the storage URI encoded in a listing dataset name.

    The listing prefix (``LISTING_PREFIX``) is removed and the remainder is
    passed through :func:`_desanitize_ds_name` to undo the ``_xHH`` escaping.

    Raises:
        ValueError: if ``dataset_name`` is not a listing dataset name
            according to :func:`is_listing_dataset`.
    """
    if is_listing_dataset(dataset_name):
        encoded_uri = dataset_name.removeprefix(LISTING_PREFIX)
        return _desanitize_ds_name(encoded_uri)
    raise ValueError(f"Dataset {dataset_name} is not a listing")
163185

164186

165187
@contextmanager

src/datachain/lib/listing_info.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,13 @@
11
from datetime import datetime, timedelta, timezone
22

3-
from datachain.client import Client
43
from datachain.lib.dataset_info import DatasetInfo
5-
from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL
4+
from datachain.lib.listing import LISTING_PREFIX, LISTING_TTL, _desanitize_ds_name
65

76

87
class ListingInfo(DatasetInfo):
98
@property
109
def uri(self) -> str:
11-
return self.name.removeprefix(LISTING_PREFIX)
12-
13-
@property
14-
def storage_uri(self) -> str:
15-
uri, _ = Client.parse_url(self.uri)
16-
return uri
10+
return _desanitize_ds_name(self.name.removeprefix(LISTING_PREFIX))
1711

1812
@property
1913
def expires(self) -> datetime | None:

tests/unit/test_listing.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from datachain.lib.file import File
1010
from datachain.lib.listing import (
1111
LISTING_PREFIX,
12+
_desanitize_ds_name,
13+
_sanitize_ds_name,
1214
get_listing,
1315
is_listing_dataset,
1416
listing_uri_from_name,
@@ -212,6 +214,13 @@ def test_is_listing_dataset(name, is_listing):
212214

213215
def test_listing_uri_from_name():
    """Listing names decode to their storage URI; other names are rejected."""
    expected_by_name = {
        # Plain name: only the listing prefix is removed.
        "lst__s3://my-bucket": "s3://my-bucket",
        # Encoded name round-trips back to the original URI.
        "lst__s3://bucket/v1_x2e0/": "s3://bucket/v1.0/",
        # Escaped _x round-trips.
        "lst__s3://bucket/export_x_xml/": "s3://bucket/export_xml/",
    }
    for name, uri in expected_by_name.items():
        assert listing_uri_from_name(name) == uri

    # A name without the listing prefix must be refused.
    with pytest.raises(ValueError):
        listing_uri_from_name("s3://my-bucket")
217226

@@ -250,3 +259,27 @@ def test_parse_listing_uri_no_collision_percent_vs_literal():
250259

251260
def test_parse_listing_uri_no_collision_dot_vs_underscore():
    """URIs differing only by ``.`` vs ``_`` must not share a ``_ds_name`` result."""
    dotted = _ds_name("s3://b/v1.0/")
    underscored = _ds_name("s3://b/v1_0/")
    assert dotted != underscored
262+
263+
264+
@pytest.mark.parametrize(
    "raw",
    [
        "s3://my-bucket/dogs/",
        "s3://bucket/v1.0/",
        "s3://bucket/dir%25/",
        "s3://bucket/user@host/",
        "s3://bucket/export_xml/",
        "s3://my.company.data/path/to/files/",
        "gs://bucket-with_underscores_x_and.dots/",
        "file:///home/user/path with spaces/",
        "s3://b/_x_x_x/edge/",
    ],
)
def test_sanitize_desanitize_roundtrip(raw):
    """Encoding *raw* and then decoding the result must give *raw* back."""
    encoded = _sanitize_ds_name(raw)
    assert _desanitize_ds_name(encoded) == raw
281+
282+
283+
def test_desanitize_plain_passthrough():
    """Input containing no ``_x`` escape is returned as-is."""
    plain = "s3://my-bucket/dogs/"
    assert _desanitize_ds_name(plain) == plain

0 commit comments

Comments
 (0)