-
Notifications
You must be signed in to change notification settings - Fork 25.2k
ESQL: Pushdown constructs doing case-insensitive regexes #128393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
caf2c94
3525880
f9b8e79
ae057c6
fdcdcb9
07411e1
7fa6686
bc25956
17b6db2
f70bc27
cb105fa
03c9202
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
pr: 128393 | ||
summary: Pushdown constructs doing case-insensitive regexes | ||
area: ES|QL | ||
type: enhancement | ||
issues: | ||
- 127479 |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above - make the parameter a class property. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
package org.elasticsearch.xpack.esql.core.util; | ||
|
||
import org.apache.lucene.document.InetAddressPoint; | ||
import org.apache.lucene.search.WildcardQuery; | ||
import org.apache.lucene.search.spell.LevenshteinDistance; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.CollectionUtil; | ||
|
@@ -178,6 +179,44 @@ public static String wildcardToJavaPattern(String pattern, char escape) { | |
return regex.toString(); | ||
} | ||
|
||
/** | ||
* Translates a Lucene wildcard pattern to a Lucene RegExp one. | ||
* @param wildcard Lucene wildcard pattern | ||
* @return Lucene RegExp pattern | ||
*/ | ||
public static String luceneWildcardToRegExp(String wildcard) { | ||
StringBuilder regex = new StringBuilder(); | ||
|
||
for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII | ||
switch (c) { | ||
case WildcardQuery.WILDCARD_STRING -> regex.append(".*"); | ||
case WildcardQuery.WILDCARD_CHAR -> regex.append("."); | ||
case WildcardQuery.WILDCARD_ESCAPE -> { | ||
if (i + 1 < wcLen) { | ||
// consume the wildcard escaping, consider the next char | ||
char next = wildcard.charAt(i + 1); | ||
i++; | ||
switch (next) { | ||
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE -> | ||
// escape `*`, `.`, `\`, since these are special chars in RegExp as well | ||
regex.append("\\"); | ||
// default: unnecessary escaping -- just ignore the escaping | ||
} | ||
regex.append(next); | ||
} else { | ||
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton | ||
regex.append("\\\\"); | ||
} | ||
} | ||
case '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c); | ||
default -> regex.append(c); | ||
} | ||
} | ||
|
||
return regex.toString(); | ||
} | ||
|
||
/** | ||
* Translates a like pattern to a Lucene wildcard. | ||
* This methods pays attention to the custom escape char which gets converted into \ (used by Lucene). | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -319,3 +319,63 @@ warningRegex:java.lang.IllegalArgumentException: single-value function encounter | |
emp_no:integer | job_positions:keyword | ||
10025 | Accountant | ||
; | ||
|
||
likeWithUpper | ||
FROM employees | ||
| KEEP emp_no, first_name | ||
| SORT emp_no | ||
| WHERE TO_UPPER(first_name) LIKE "GEOR*" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add a couple of negative tests, eg. where TO_UPPER tries to match a lowercase pattern
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added more tests (the logical folding is tested as well) |
||
; | ||
|
||
emp_no:integer |first_name:keyword | ||
10001 |Georgi | ||
10055 |Georgy | ||
; | ||
|
||
likeWithLower | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we decide to accept the breaking change, you'll need at least a new capability, otherwise these tests will fail on mixed clusters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No longer needed. |
||
FROM employees | ||
| KEEP emp_no, first_name | ||
| SORT emp_no | ||
| WHERE TO_LOWER(TO_UPPER(first_name)) LIKE "geor*" | ||
; | ||
|
||
emp_no:integer |first_name:keyword | ||
10001 |Georgi | ||
10055 |Georgy | ||
; | ||
|
||
rlikeWithUpper | ||
FROM employees | ||
| KEEP emp_no, first_name | ||
| SORT emp_no | ||
| WHERE TO_UPPER(first_name) RLIKE "GEOR.*" | ||
; | ||
|
||
emp_no:integer |first_name:keyword | ||
10001 |Georgi | ||
10055 |Georgy | ||
; | ||
|
||
rlikeWithLower | ||
FROM employees | ||
| KEEP emp_no, first_name | ||
| SORT emp_no | ||
| WHERE TO_LOWER(TO_UPPER(first_name)) RLIKE "geor.*" | ||
; | ||
|
||
emp_no:integer |first_name:keyword | ||
10001 |Georgi | ||
10055 |Georgy | ||
; | ||
|
||
negatedRLikeWithLower | ||
FROM employees | ||
| KEEP emp_no, first_name | ||
| SORT emp_no | ||
| WHERE TO_LOWER(TO_UPPER(first_name)) NOT RLIKE "geor.*" | ||
| STATS c = COUNT() | ||
; | ||
|
||
c:long | ||
88 | ||
; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,10 +7,12 @@ | |
|
||
package org.elasticsearch.xpack.esql.expression.function.scalar.string; | ||
|
||
import org.elasticsearch.TransportVersions; | ||
import org.elasticsearch.common.io.stream.NamedWriteableRegistry; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
import org.elasticsearch.common.io.stream.StreamOutput; | ||
import org.elasticsearch.compute.operator.EvalOperator; | ||
import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException; | ||
import org.elasticsearch.xpack.esql.capabilities.TranslationAware; | ||
import org.elasticsearch.xpack.esql.core.expression.Expression; | ||
import org.elasticsearch.xpack.esql.core.expression.FoldContext; | ||
|
@@ -37,6 +39,7 @@ public class RLike extends org.elasticsearch.xpack.esql.core.expression.predicat | |
EvaluatorMapper, | ||
TranslationAware.SingleValueTranslationAware { | ||
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "RLike", RLike::new); | ||
public static final String NAME = "RLIKE"; | ||
|
||
@FunctionInfo(returnType = "boolean", description = """ | ||
Use `RLIKE` to filter data based on string patterns using using | ||
|
@@ -52,28 +55,43 @@ Matching special characters (eg. `.`, `*`, `(`...) will require escaping. | |
To reduce the overhead of escaping, we suggest using triple quotes strings `\"\"\"` | ||
|
||
<<load-esql-example, file=string tag=rlikeEscapingTripleQuotes>> | ||
""", operator = "RLIKE", examples = @Example(file = "docs", tag = "rlike")) | ||
""", operator = NAME, examples = @Example(file = "docs", tag = "rlike")) | ||
public RLike( | ||
Source source, | ||
@Param(name = "str", type = { "keyword", "text" }, description = "A literal value.") Expression value, | ||
@Param(name = "pattern", type = { "keyword", "text" }, description = "A regular expression.") RLikePattern pattern | ||
) { | ||
super(source, value, pattern); | ||
this(source, value, pattern, false); | ||
} | ||
|
||
public RLike(Source source, Expression field, RLikePattern rLikePattern, boolean caseInsensitive) { | ||
super(source, field, rLikePattern, caseInsensitive); | ||
} | ||
|
||
private RLike(StreamInput in) throws IOException { | ||
this(Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class), new RLikePattern(in.readString())); | ||
this( | ||
Source.readFrom((PlanStreamInput) in), | ||
in.readNamedWriteable(Expression.class), | ||
new RLikePattern(in.readString()), | ||
in.getTransportVersion().onOrAfter(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY) && in.readBoolean() | ||
); | ||
} | ||
|
||
@Override | ||
public void writeTo(StreamOutput out) throws IOException { | ||
source().writeTo(out); | ||
out.writeNamedWriteable(field()); | ||
out.writeString(pattern().asJavaRegex()); | ||
if (caseInsensitive() && out.getTransportVersion().before(TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we just re-add the We'll have to be careful with layouts and NameIDs, but probably it's not impossible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would also give us a bwc advantage if we decide to add case insensitive operators to the grammar. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Clever idea: serialize Another suggestion that might be cleaner would be to perform the optimization on the data nodes only, not on the coordinator to avoid the difference in serialization. |
||
// The plan has been optimized to run a case-insensitive match, which the remote peer cannot be notified of. Simply avoiding | ||
// the serialization of the boolean would result in wrong results. | ||
throw new EsqlIllegalArgumentException( | ||
NAME + " with case insensitivity is not supported in peer node's version [{}]. Upgrade to version [{}] or newer.", | ||
out.getTransportVersion(), | ||
TransportVersions.ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY | ||
); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pointing out this fast-failure here // bwc issue. We haven't been serialising the case-insensitivity of the regexp operators, so far. We now need to. Not sure we have an alternative to failing the query in this case, without introducing some compensating mechanisms like rerouting queries to old nodes for planning. The planner isn't currently aware of the versions it should plan for. Introducing new operators ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another case for us needing to know if we can enable the optimization based on the minimum node version, or something like it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't make these decisions on the local node only? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case we could be safe, see my comment below, but in general I agree that having minimum node version (or node features, or whatever) at planning level would help a lot. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We probably should be able to pull that into a rule that applies locally-only, yes.. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extracted the logic into local optimiser-only rule. |
||
out.writeBoolean(caseInsensitive()); | ||
} | ||
|
||
@Override | ||
|
@@ -103,7 +121,7 @@ public Boolean fold(FoldContext ctx) { | |
|
||
@Override | ||
public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { | ||
return AutomataMatch.toEvaluator(source(), toEvaluator.apply(field()), pattern().createAutomaton()); | ||
return AutomataMatch.toEvaluator(source(), toEvaluator.apply(field()), pattern().createAutomaton(caseInsensitive())); | ||
} | ||
|
||
@Override | ||
|
@@ -122,4 +140,9 @@ public Query asQuery(LucenePushdownPredicates pushdownPredicates, TranslatorHand | |
public Expression singleValueField() { | ||
return field(); | ||
} | ||
|
||
@Override | ||
public String nodeString() { | ||
return NAME + "(" + field().nodeString() + ", \"" + pattern().pattern() + "\", " + caseInsensitive() + ")"; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Expose ignoreCase as a property in StringPattern since it affects both the Automaton and javaRegex. The former can contain the mode but the latter doesn't so we need a way to bubble it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pattern is independent of how it's used for matching, casing-wise. The java regex version has it's own mechanism to flag case insensitivity and not sure it'd be trivial, or "safe", or even needed to modify it based on a method parameter.
But even if we updated the
StringPattern
interface, we'd have to recreate the object if theRegexMatch
requires case-insenstive matching, since theStringPattern
object is created at parsing time (when it's not known if the matching will be case insensitive or not).Furthermore, the
matchesAll()
andexactMatch()
methods ofAbstractStringPattern
also callingautomaton()
are invariant to casing.So not sure if we'd need any more changes, but if there's a better solution here, happy to apply it.