Skip to content

Commit a752cf5

Browse files
authored
fix: Change get_filters to not hang due to long filter strings (#1418)
* test: Add get_filters() unit tests for long filters * test: Add pytest-timeout dependency for tests * feat: Add split_not_in_quotes() utility function and unit tests * fix: Change get_filters to use split_not_in_quotes() function * chore: Lint issue * chore: Reformat
1 parent 7814906 commit a752cf5

File tree

6 files changed

+230
-27
lines changed

6 files changed

+230
-27
lines changed

data_validation/cli_tools.py

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,18 @@
5252
import uuid
5353
import os
5454
import math
55-
import re
5655
from argparse import Namespace
5756
from typing import Dict, List, Optional
5857
from yaml import Dumper, Loader, dump, load
5958

60-
from data_validation import clients, consts, find_tables, state_manager, gcs_helper
59+
from data_validation import (
60+
clients,
61+
consts,
62+
find_tables,
63+
state_manager,
64+
gcs_helper,
65+
util,
66+
)
6167
from data_validation.validation_builder import list_to_sublists
6268

6369

@@ -1202,31 +1208,29 @@ def get_filters(filter_value: str) -> List[Dict]:
12021208
If only one filter is specified, it applies to both source and target
12031209
For a doc on regular expression for filters see docs/internal/filters_regex.md
12041210
"""
1205-
1206-
single_filter = r"([^':]*('[^']*')*)*"
1207-
double_filter = (
1208-
r"(?P<source>" + single_filter + r"):(?P<target>" + single_filter + r")"
1209-
)
1210-
filter_config = []
1211-
if result := re.fullmatch(single_filter, filter_value):
1212-
if result.group(0) == "":
1211+
filters = util.split_not_in_quotes(filter_value, ":")
1212+
if len(filters) not in (1, 2):
1213+
raise argparse.ArgumentTypeError("Unable to parse filter arguments.")
1214+
filters = [_.strip() for _ in filters]
1215+
if len(filters) == 1:
1216+
if not filters[0]:
12131217
raise argparse.ArgumentTypeError("Empty string not allowed in filter")
12141218
filter_dict = {
12151219
"type": "custom",
1216-
"source": result.group(0),
1217-
"target": result.group(0),
1220+
"source": filters[0],
1221+
"target": filters[0],
12181222
}
1219-
elif result := re.fullmatch(double_filter, filter_value):
1220-
if result.group("source") == "" or result.group("target") == "":
1223+
elif len(filters) == 2:
1224+
if not filters[0] or not filters[1]:
12211225
raise argparse.ArgumentTypeError("Empty string not allowed in filter")
12221226
filter_dict = {
12231227
"type": "custom",
1224-
"source": result.group("source"),
1225-
"target": result.group("target"),
1228+
"source": filters[0],
1229+
"target": filters[1],
12261230
}
1227-
else:
1228-
raise argparse.ArgumentTypeError("Unable to parse filter arguments.")
1229-
filter_config.append(filter_dict)
1231+
filter_config = [
1232+
filter_dict,
1233+
]
12301234
return filter_config
12311235

12321236

data_validation/util.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import logging
16+
import re
1617
import time
1718

1819

@@ -22,3 +23,24 @@ def timed_call(log_txt, fn, *args, **kwargs):
2223
elapsed = time.time() - t0
2324
logging.debug(f"{log_txt} elapsed: {round(elapsed, 2)}s")
2425
return result
26+
27+
28+
def split_not_in_quotes(
29+
to_split: str, sep: str = " ", exclude_empty_tokens: bool = False
30+
) -> list:
31+
"""Split a string by a separator but only when the separator is not inside quotes.
32+
re pattern taken from this comment:
33+
https://stackoverflow.com/a/2787979/10979853
34+
The commenter's words should the link ever go stale:
35+
Each time it finds a semicolon, the lookahead scans the entire remaining string,
36+
making sure there's an even number of single-quotes and an even number of double-quotes.
37+
(Single-quotes inside double-quoted fields, or vice-versa, are ignored.) If the
38+
lookahead succeeds, the semicolon is a delimiter.
39+
The pattern doesn't cope with whitespace as sep, back to back spaces are multiple seps, therefore
40+
we have exclude_empty_tokens parameter.
41+
"""
42+
pattern = r"""%(sep)s(?=(?:[^'"]|'[^']*'|"[^"]*")*$)""" % {"sep": sep}
43+
if exclude_empty_tokens:
44+
return [t for t in re.split(pattern, to_split) if t]
45+
else:
46+
return re.split(pattern, to_split)

noxfile.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@
4848
def _setup_session_requirements(session, extra_packages=[]):
4949
"""Install requirements for nox tests."""
5050

51-
session.install("--upgrade", "pip", "pytest", "pytest-cov", "wheel")
51+
session.install(
52+
"--upgrade", "pip", "pytest", "pytest-cov", "pytest-timeout", "wheel"
53+
)
5254
session.install("--no-cache-dir", "-e", ".")
5355

5456
if extra_packages:

setup.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@
5050
extras_require = {
5151
"apache-airflow": "1.10.11",
5252
"pyspark": "3.0.0",
53+
"develop": [
54+
"black==22.3.0",
55+
"flake8",
56+
"freezegun",
57+
"pyfakefs==4.6.2",
58+
"pytest",
59+
"pytest-cov",
60+
"pytest-timeout",
61+
],
5362
}
5463

5564
packages = [

tests/unit/test_cli_tools.py

Lines changed: 90 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -564,14 +564,17 @@ def test_get_result_handler_by_conn_file(fs):
564564
}
565565

566566

567+
@pytest.mark.timeout(10)
567568
@pytest.mark.parametrize(
568569
"test_input,expected",
569570
[
571+
# Simple filters.
572+
("id < 5", [{"type": "custom", "source": "id < 5", "target": "id < 5"}]),
573+
("id > 5", [{"type": "custom", "source": "id > 5", "target": "id > 5"}]),
570574
(
571-
"id < 5:row_id <5",
572-
[{"type": "custom", "source": "id < 5", "target": "row_id <5"}],
575+
"id = 'abc'",
576+
[{"type": "custom", "source": "id = 'abc'", "target": "id = 'abc'"}],
573577
),
574-
("id < 5", [{"type": "custom", "source": "id < 5", "target": "id < 5"}]),
575578
(
576579
"name != 'John'",
577580
[
@@ -582,6 +585,7 @@ def test_get_result_handler_by_conn_file(fs):
582585
}
583586
],
584587
),
588+
# With an escaped single quote.
585589
(
586590
"name != 'St. John''s'",
587591
[
@@ -592,20 +596,99 @@ def test_get_result_handler_by_conn_file(fs):
592596
}
593597
],
594598
),
599+
# Filter pairs.
600+
(
601+
"id < 5:row_id <5",
602+
[{"type": "custom", "source": "id < 5", "target": "row_id <5"}],
603+
),
604+
(
605+
"id = 'abc':row_id='abc'",
606+
[{"type": "custom", "source": "id = 'abc'", "target": "row_id='abc'"}],
607+
),
608+
# Really long filters.
609+
(
610+
"id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
611+
[
612+
{
613+
"type": "custom",
614+
"source": "id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
615+
"target": "id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
616+
}
617+
],
618+
),
595619
(
596-
"mod_timestamp >= '2024-04-01 16:00:00 UTC':mod_timestamp >= '2020-04-01 16:00:00 UTC'",
620+
"id12345678901234567890=12345678901234567890:row_id12345678901234567890=12345678901234567890",
597621
[
598622
{
599623
"type": "custom",
600-
"source": "mod_timestamp >= '2024-04-01 16:00:00 UTC'",
601-
"target": "mod_timestamp >= '2020-04-01 16:00:00 UTC'",
624+
"source": "id12345678901234567890=12345678901234567890",
625+
"target": "row_id12345678901234567890=12345678901234567890",
602626
}
603627
],
604628
),
605629
],
606630
)
607-
def test_get_filters(test_input, expected):
631+
def test_get_filters_simple(test_input: str, expected: list):
632+
"""Test get filters."""
633+
res = cli_tools.get_filters(test_input)
634+
assert res == expected
635+
636+
637+
@pytest.mark.parametrize(
638+
"test_input",
639+
[
640+
(""),
641+
(" "),
642+
(":"),
643+
(" : "),
644+
],
645+
)
646+
def test_get_filters_fail(test_input: str):
608647
"""Test get filters."""
648+
with pytest.raises(argparse.ArgumentTypeError):
649+
_ = cli_tools.get_filters(test_input)
650+
651+
652+
@pytest.mark.parametrize(
653+
"test_input,expected",
654+
[
655+
# Timestamp related characters.
656+
(
657+
"col_ts >= '2024-04-01 16:00:00 UTC'",
658+
[
659+
{
660+
"type": "custom",
661+
"source": "col_ts >= '2024-04-01 16:00:00 UTC'",
662+
"target": "col_ts >= '2024-04-01 16:00:00 UTC'",
663+
}
664+
],
665+
),
666+
# Timestamp related characters with a filter pair colon.
667+
(
668+
"col_ts >= '2024-04-01 16:00:00 UTC':col_ts >= '2020-04-01 16.00.00 +00:00'",
669+
[
670+
{
671+
"type": "custom",
672+
"source": "col_ts >= '2024-04-01 16:00:00 UTC'",
673+
"target": "col_ts >= '2020-04-01 16.00.00 +00:00'",
674+
}
675+
],
676+
),
677+
# Timestamp with greater-than, less-than and parentheses.
678+
(
679+
"col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
680+
[
681+
{
682+
"type": "custom",
683+
"source": "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
684+
"target": "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
685+
}
686+
],
687+
),
688+
],
689+
)
690+
def test_get_filters_datetimes(test_input, expected):
691+
"""Test get filters with timestamps."""
609692
res = cli_tools.get_filters(test_input)
610693
assert res == expected
611694

tests/unit/test_util.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,86 @@ def fn_cwargs(c1: int = None):
6060

6161
assert fn_cwargs(3) == module_under_test.timed_call("cwargs fn", fn_cwargs, c1=3)
6262
assert any(_ for _ in caplog.messages if "cwargs fn" in _)
63+
64+
65+
@pytest.mark.parametrize(
66+
"test_input,expected,sep",
67+
[
68+
# Test with default separator of space.
69+
(
70+
"abc",
71+
[
72+
"abc",
73+
],
74+
None,
75+
),
76+
(
77+
"a b c",
78+
[
79+
"a",
80+
"b",
81+
"c",
82+
],
83+
None,
84+
),
85+
(
86+
"a 'b c'",
87+
[
88+
"a",
89+
"'b c'",
90+
],
91+
None,
92+
),
93+
# Test with separator of ":" which matches --filters separator.
94+
(
95+
"col1 = 1:col2 = 1",
96+
[
97+
"col1 = 1",
98+
"col2 = 1",
99+
],
100+
":",
101+
),
102+
(
103+
"col1 = ':'",
104+
[
105+
"col1 = ':'",
106+
],
107+
":",
108+
),
109+
(
110+
"col1 = ':':col2 = ''':'''",
111+
[
112+
"col1 = ':'",
113+
"col2 = ''':'''",
114+
],
115+
":",
116+
),
117+
(
118+
"col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') "
119+
"and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
120+
[
121+
"col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') "
122+
"and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
123+
],
124+
":",
125+
),
126+
(
127+
"col_ts1 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')"
128+
":"
129+
"col_ts2 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
130+
[
131+
"col_ts1 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
132+
"col_ts2 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
133+
],
134+
":",
135+
),
136+
],
137+
)
138+
def test_split_not_in_quotes(
139+
module_under_test, test_input: str, expected: tuple, sep: str
140+
):
141+
if sep:
142+
result = module_under_test.split_not_in_quotes(test_input, sep=sep)
143+
else:
144+
result = module_under_test.split_not_in_quotes(test_input)
145+
assert result == expected

0 commit comments

Comments
 (0)