fix: Change get_filters to not hang due to long filter strings (#1418)

nj1973 · web-flow · commit a752cf5ac0b0 · 2025-02-04T06:37:25.000Z
* test: Add get_filters() unit tests for long filters

* test: Add pytest-timeout dependency for tests

* feat: Add split_not_in_quotes() utility function and unit tests

* fix: Change get_filters to use split_not_in_quotes() function

* chore: Lint issue

* chore: Reformat
diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py
@@ -52,12 +52,18 @@
 import uuid
 import os
 import math
-import re
 from argparse import Namespace
 from typing import Dict, List, Optional
 from yaml import Dumper, Loader, dump, load
 
-from data_validation import clients, consts, find_tables, state_manager, gcs_helper
+from data_validation import (
+    clients,
+    consts,
+    find_tables,
+    state_manager,
+    gcs_helper,
+    util,
+)
 from data_validation.validation_builder import list_to_sublists
 
 
@@ -1202,31 +1208,29 @@ def get_filters(filter_value: str) -> List[Dict]:
     If only one filter is specified, it applies to both source and target
     For a doc on regular expression for filters see docs/internal/filters_regex.md
     """
-
-    single_filter = r"([^':]*('[^']*')*)*"
-    double_filter = (
-        r"(?P<source>" + single_filter + r"):(?P<target>" + single_filter + r")"
-    )
-    filter_config = []
-    if result := re.fullmatch(single_filter, filter_value):
-        if result.group(0) == "":
+    filters = util.split_not_in_quotes(filter_value, ":")
+    if len(filters) not in (1, 2):
+        raise argparse.ArgumentTypeError("Unable to parse filter arguments.")
+    filters = [_.strip() for _ in filters]
+    if len(filters) == 1:
+        if not filters[0]:
             raise argparse.ArgumentTypeError("Empty string not allowed in filter")
         filter_dict = {
             "type": "custom",
-            "source": result.group(0),
-            "target": result.group(0),
+            "source": filters[0],
+            "target": filters[0],
         }
-    elif result := re.fullmatch(double_filter, filter_value):
-        if result.group("source") == "" or result.group("target") == "":
+    elif len(filters) == 2:
+        if not filters[0] or not filters[1]:
             raise argparse.ArgumentTypeError("Empty string not allowed in filter")
         filter_dict = {
             "type": "custom",
-            "source": result.group("source"),
-            "target": result.group("target"),
+            "source": filters[0],
+            "target": filters[1],
         }
-    else:
-        raise argparse.ArgumentTypeError("Unable to parse filter arguments.")
-    filter_config.append(filter_dict)
+    filter_config = [
+        filter_dict,
+    ]
     return filter_config
 
 
diff --git a/data_validation/util.py b/data_validation/util.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import re
 import time
 
 
@@ -22,3 +23,24 @@ def timed_call(log_txt, fn, *args, **kwargs):
     elapsed = time.time() - t0
     logging.debug(f"{log_txt} elapsed: {round(elapsed, 2)}s")
     return result
+
+
+def split_not_in_quotes(
+    to_split: str, sep: str = " ", exclude_empty_tokens: bool = False
+) -> list:
+    """Split a string by a separator but only when the separator is not inside quotes.
+    re pattern taken from this comment:
+        https://stackoverflow.com/a/2787979/10979853
+    The commenter's explanation (their separator was a semicolon), should the link ever go stale:
+        Each time it finds a semicolon, the lookahead scans the entire remaining string,
+        making sure there's an even number of single-quotes and an even number of double-quotes.
+        (Single-quotes inside double-quoted fields, or vice-versa, are ignored.) If the
+        lookahead succeeds, the semicolon is a delimiter.
+    The pattern treats every occurrence of sep as a delimiter, so back-to-back spaces produce
+    empty tokens; pass exclude_empty_tokens=True to drop them.
+    """
+    pattern = r"""%(sep)s(?=(?:[^'"]|'[^']*'|"[^"]*")*$)""" % {"sep": sep}
+    if exclude_empty_tokens:
+        return [t for t in re.split(pattern, to_split) if t]
+    else:
+        return re.split(pattern, to_split)
diff --git a/noxfile.py b/noxfile.py
@@ -48,7 +48,9 @@
 def _setup_session_requirements(session, extra_packages=[]):
     """Install requirements for nox tests."""
 
-    session.install("--upgrade", "pip", "pytest", "pytest-cov", "wheel")
+    session.install(
+        "--upgrade", "pip", "pytest", "pytest-cov", "pytest-timeout", "wheel"
+    )
     session.install("--no-cache-dir", "-e", ".")
 
     if extra_packages:
diff --git a/setup.py b/setup.py
@@ -50,6 +50,15 @@
 extras_require = {
     "apache-airflow": "1.10.11",
     "pyspark": "3.0.0",
+    "develop": [
+        "black==22.3.0",
+        "flake8",
+        "freezegun",
+        "pyfakefs==4.6.2",
+        "pytest",
+        "pytest-cov",
+        "pytest-timeout",
+    ],
 }
 
 packages = [
diff --git a/tests/unit/test_cli_tools.py b/tests/unit/test_cli_tools.py
@@ -564,14 +564,17 @@ def test_get_result_handler_by_conn_file(fs):
     }
 
 
+@pytest.mark.timeout(10)
 @pytest.mark.parametrize(
     "test_input,expected",
     [
+        # Simple filters.
+        ("id < 5", [{"type": "custom", "source": "id < 5", "target": "id < 5"}]),
+        ("id > 5", [{"type": "custom", "source": "id > 5", "target": "id > 5"}]),
         (
-            "id < 5:row_id <5",
-            [{"type": "custom", "source": "id < 5", "target": "row_id <5"}],
+            "id = 'abc'",
+            [{"type": "custom", "source": "id = 'abc'", "target": "id = 'abc'"}],
         ),
-        ("id < 5", [{"type": "custom", "source": "id < 5", "target": "id < 5"}]),
         (
             "name != 'John'",
             [
@@ -582,6 +585,7 @@ def test_get_result_handler_by_conn_file(fs):
                 }
             ],
         ),
+        # With an escaped single quote.
         (
             "name != 'St. John''s'",
             [
@@ -592,20 +596,99 @@ def test_get_result_handler_by_conn_file(fs):
                 }
             ],
         ),
+        # Filter pairs.
+        (
+            "id < 5:row_id <5",
+            [{"type": "custom", "source": "id < 5", "target": "row_id <5"}],
+        ),
+        (
+            "id = 'abc':row_id='abc'",
+            [{"type": "custom", "source": "id = 'abc'", "target": "row_id='abc'"}],
+        ),
+        # Really long filters.
+        (
+            "id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
+            [
+                {
+                    "type": "custom",
+                    "source": "id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
+                    "target": "id12345678901234567890 = 'abcdefghijklmnopqrstuvwxyz'",
+                }
+            ],
+        ),
         (
-            "mod_timestamp >= '2024-04-01 16:00:00 UTC':mod_timestamp >= '2020-04-01 16:00:00 UTC'",
+            "id12345678901234567890=12345678901234567890:row_id12345678901234567890=12345678901234567890",
             [
                 {
                     "type": "custom",
-                    "source": "mod_timestamp >= '2024-04-01 16:00:00 UTC'",
-                    "target": "mod_timestamp >= '2020-04-01 16:00:00 UTC'",
+                    "source": "id12345678901234567890=12345678901234567890",
+                    "target": "row_id12345678901234567890=12345678901234567890",
                 }
             ],
         ),
     ],
 )
-def test_get_filters(test_input, expected):
+def test_get_filters_simple(test_input: str, expected: list):
+    """Test get filters."""
+    res = cli_tools.get_filters(test_input)
+    assert res == expected
+
+
+@pytest.mark.parametrize(
+    "test_input",
+    [
+        (""),
+        ("  "),
+        (":"),
+        (" : "),
+    ],
+)
+def test_get_filters_fail(test_input: str):
     """Test get filters."""
+    with pytest.raises(argparse.ArgumentTypeError):
+        _ = cli_tools.get_filters(test_input)
+
+
+@pytest.mark.parametrize(
+    "test_input,expected",
+    [
+        # Timestamp related characters.
+        (
+            "col_ts >= '2024-04-01 16:00:00 UTC'",
+            [
+                {
+                    "type": "custom",
+                    "source": "col_ts >= '2024-04-01 16:00:00 UTC'",
+                    "target": "col_ts >= '2024-04-01 16:00:00 UTC'",
+                }
+            ],
+        ),
+        # Timestamp related characters with a filter pair colon.
+        (
+            "col_ts >= '2024-04-01 16:00:00 UTC':col_ts >= '2020-04-01 16.00.00 +00:00'",
+            [
+                {
+                    "type": "custom",
+                    "source": "col_ts >= '2024-04-01 16:00:00 UTC'",
+                    "target": "col_ts >= '2020-04-01 16.00.00 +00:00'",
+                }
+            ],
+        ),
+        # Timestamp with greater-than, less-than and parentheses.
+        (
+            "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+            [
+                {
+                    "type": "custom",
+                    "source": "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+                    "target": "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+                }
+            ],
+        ),
+    ],
+)
+def test_get_filters_datetimes(test_input, expected):
+    """Test get filters with timestamps."""
     res = cli_tools.get_filters(test_input)
     assert res == expected
 
diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py
@@ -60,3 +60,86 @@ def fn_cwargs(c1: int = None):
 
     assert fn_cwargs(3) == module_under_test.timed_call("cwargs fn", fn_cwargs, c1=3)
     assert any(_ for _ in caplog.messages if "cwargs fn" in _)
+
+
+@pytest.mark.parametrize(
+    "test_input,expected,sep",
+    [
+        # Test with default separator of space.
+        (
+            "abc",
+            [
+                "abc",
+            ],
+            None,
+        ),
+        (
+            "a b c",
+            [
+                "a",
+                "b",
+                "c",
+            ],
+            None,
+        ),
+        (
+            "a 'b c'",
+            [
+                "a",
+                "'b c'",
+            ],
+            None,
+        ),
+        # Test with separator of ":" which matches --filters separator.
+        (
+            "col1 = 1:col2 = 1",
+            [
+                "col1 = 1",
+                "col2 = 1",
+            ],
+            ":",
+        ),
+        (
+            "col1 = ':'",
+            [
+                "col1 = ':'",
+            ],
+            ":",
+        ),
+        (
+            "col1 = ':':col2 = ''':'''",
+            [
+                "col1 = ':'",
+                "col2 = ''':'''",
+            ],
+            ":",
+        ),
+        (
+            "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') "
+            "and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+            [
+                "col_ts >= to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') "
+                "and col_ts < to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+            ],
+            ":",
+        ),
+        (
+            "col_ts1 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')"
+            ":"
+            "col_ts2 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+            [
+                "col_ts1 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+                "col_ts2 between to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS') and to_timestamp('2024-04-01 16:00:00','YYYY-MM-DD HH24:MI:SS')",
+            ],
+            ":",
+        ),
+    ],
+)
+def test_split_not_in_quotes(
+    module_under_test, test_input: str, expected: tuple, sep: str
+):
+    if sep:
+        result = module_under_test.split_not_in_quotes(test_input, sep=sep)
+    else:
+        result = module_under_test.split_not_in_quotes(test_input)
+    assert result == expected