From 79640b3eaf4bfbce1136147b57931ce7aa1892e9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 29 Sep 2023 14:12:41 +0200 Subject: [PATCH 1/3] Optimize .? in the FastRegexMatcher Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 33 ++++++++++- model/labels/regexp_test.go | 112 ++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index 420d35670a..da22b1000b 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -480,6 +480,15 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { // Any string is fine (including an empty one), as far as it doesn't contain any newline. return anyStringWithoutNewlineMatcher{} + case syntax.OpQuest: + // Only optimize for ".?". + if len(re.Sub) != 1 || re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { + return nil + } + + return &zeroOrOneCharacterStringMatcher{ + matchNL: re.Sub[0].Op == syntax.OpAnyChar, + } case syntax.OpEmptyMatch: return emptyStringMatcher{} @@ -511,14 +520,14 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { var left, right StringMatcher // Let's try to find if there's a first and last any matchers. - if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar { + if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest { left = stringMatcherFromRegexpInternal(re.Sub[0]) if left == nil { return nil } re.Sub = re.Sub[1:] } - if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar { + if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest { right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1]) if right == nil { return nil @@ -845,6 +854,26 @@ func (m *anyNonEmptyStringMatcher) Matches(s string) bool { return len(s) > 0 && strings.IndexByte(s, '\n') == -1 } +// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence +// of any character. The newline character is matches only if matchNL is set to true. +type zeroOrOneCharacterStringMatcher struct { + matchNL bool +} + +func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool { + // Zero or one. + if len(s) > 1 { + return false + } + + // No need to check for the newline if the string is empty or matching a newline is OK. + if m.matchNL || len(s) == 0 { + return true + } + + return s[0] != '\n' +} + // trueMatcher is a stringMatcher which matches any string (always returns true). type trueMatcher struct{} diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index 00538aca48..ee26cfeedb 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -78,6 +78,13 @@ var ( "(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))", // A long case insensitive alternation where each entry starts with ".*". "(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))", + // Quest ".?". + "fo.?", + "foo.?", + "f.?o", + ".*foo.?", + ".?foo.+", + "foo.?|bar", } values = []string{ "foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "", @@ -418,6 +425,13 @@ func TestStringMatcherFromRegexp(t *testing.T) { {"foo.+.+", nil}, {".*.*foo", nil}, {".+.+foo", nil}, + {"aaa.?.?", nil}, + {"aaa.?.*", nil}, + // Regexps with ".?". + {"ext.?|xfs", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}}, + {"(?s)(ext.?|xfs)", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: true}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}}, + {"foo.?", &literalPrefixStringMatcher{prefix: "foo", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}}, + {"f.?o", nil}, } { c := c t.Run(c.pattern, func(t *testing.T) { @@ -588,6 +602,88 @@ func TestStringMatcherFromRegexp_LiteralSuffix(t *testing.T) { } } +func TestStringMatcherFromRegexp_Quest(t *testing.T) { + for _, c := range []struct { + pattern string + expectedZeroOrOneMatchers int + expectedMatches []string + expectedNotMatches []string + }{ + // Not match newline. + { + pattern: "test.?", + expectedZeroOrOneMatchers: 1, + expectedMatches: []string{"test", "test!"}, + expectedNotMatches: []string{"test\n", "tes", "test!!"}, + }, { + pattern: ".?test", + expectedZeroOrOneMatchers: 1, + expectedMatches: []string{"test", "!test"}, + expectedNotMatches: []string{"\ntest", "tes", "test!"}, + }, { + pattern: "(aaa.?|bbb.?)", + expectedZeroOrOneMatchers: 2, + expectedMatches: []string{"aaa", "aaaX", "bbb", "bbbX"}, + expectedNotMatches: []string{"aa", "aaaXX", "aaa\n", "bb", "bbbXX", "bbb\n"}, + }, { + pattern: ".*aaa.?", + expectedZeroOrOneMatchers: 1, + expectedMatches: []string{"aaa", "Xaaa", "aaaX", "XXXaaa", "XXXaaaX"}, + expectedNotMatches: []string{"aa", "aaaXX", "XXXaaaXXX", "XXXaaa\n"}, + }, + + // Match newline. + { + pattern: "(?s)test.?", + expectedZeroOrOneMatchers: 1, + expectedMatches: []string{"test", "test!", "test\n"}, + expectedNotMatches: []string{"tes", "test!!", "test\n\n"}, + }, + + // Mixed flags (a part matches newline another doesn't). + { + pattern: "(aaa.?|((?s).?bbb.+))", + expectedZeroOrOneMatchers: 2, + expectedMatches: []string{"aaa", "aaaX", "bbbX", "XbbbX", "bbbXXX", "\nbbbX"}, + expectedNotMatches: []string{"aa", "aaa\n", "Xbbb", "\nbbb"}, + }, + } { + t.Run(c.pattern, func(t *testing.T) { + parsed, err := syntax.Parse(c.pattern, syntax.Perl) + require.NoError(t, err) + + matcher := stringMatcherFromRegexp(parsed) + require.NotNil(t, matcher) + + re := regexp.MustCompile("^" + c.pattern + "$") + + // Pre-condition check: ensure it contains zeroOrOneCharacterStringMatcher. + numZeroOrOneMatchers := 0 + visitStringMatcher(matcher, func(matcher StringMatcher) { + if _, ok := matcher.(*zeroOrOneCharacterStringMatcher); ok { + numZeroOrOneMatchers++ + } + }) + + require.Equal(t, c.expectedZeroOrOneMatchers, numZeroOrOneMatchers) + + for _, value := range c.expectedMatches { + assert.Truef(t, matcher.Matches(value), "Value: %s", value) + + // Ensure the golang regexp engine would return the same. + assert.Truef(t, re.MatchString(value), "Value: %s", value) + } + + for _, value := range c.expectedNotMatches { + assert.Falsef(t, matcher.Matches(value), "Value: %s", value) + + // Ensure the golang regexp engine would return the same. + assert.Falsef(t, re.MatchString(value), "Value: %s", value) + } + }) + } +} + func randString(randGenerator *rand.Rand, length int) string { b := make([]rune, length) for i := range b { @@ -1106,6 +1202,22 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) { } } +func TestZeroOrOneCharacterStringMatcher(t *testing.T) { + matcher := &zeroOrOneCharacterStringMatcher{matchNL: true} + assert.True(t, matcher.Matches("")) + assert.True(t, matcher.Matches("x")) + assert.True(t, matcher.Matches("\n")) + assert.False(t, matcher.Matches("xx")) + assert.False(t, matcher.Matches("\n\n")) + + matcher = &zeroOrOneCharacterStringMatcher{matchNL: false} + assert.True(t, matcher.Matches("")) + assert.True(t, matcher.Matches("x")) + assert.False(t, matcher.Matches("\n")) + assert.False(t, matcher.Matches("xx")) + assert.False(t, matcher.Matches("\n\n")) +} + func TestLiteralPrefixStringMatcher(t *testing.T) { m := &literalPrefixStringMatcher{prefix: "mar", prefixCaseSensitive: true, right: &emptyStringMatcher{}} assert.True(t, m.Matches("mar")) From 0df8e0fac10aaeee4ea3c89a91e58816bd0db412 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 29 Sep 2023 14:55:28 +0200 Subject: [PATCH 2/3] Clarify ordering Signed-off-by: Marco Pracucci --- model/labels/regexp.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/labels/regexp.go b/model/labels/regexp.go index da22b1000b..92f71b0345 100644 --- a/model/labels/regexp.go +++ b/model/labels/regexp.go @@ -482,7 +482,7 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher { return anyStringWithoutNewlineMatcher{} case syntax.OpQuest: // Only optimize for ".?". - if len(re.Sub) != 1 || re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL { + if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) { return nil } From 5d7eb6d8f9384c84c494f644fad5d0e1ec6277ae Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 29 Sep 2023 15:01:41 +0200 Subject: [PATCH 3/3] Make linter happy Signed-off-by: Marco Pracucci --- model/labels/regexp_test.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/model/labels/regexp_test.go b/model/labels/regexp_test.go index ee26cfeedb..123f2f19bb 100644 --- a/model/labels/regexp_test.go +++ b/model/labels/regexp_test.go @@ -615,17 +615,20 @@ func TestStringMatcherFromRegexp_Quest(t *testing.T) { expectedZeroOrOneMatchers: 1, expectedMatches: []string{"test", "test!"}, expectedNotMatches: []string{"test\n", "tes", "test!!"}, - }, { + }, + { pattern: ".?test", expectedZeroOrOneMatchers: 1, expectedMatches: []string{"test", "!test"}, expectedNotMatches: []string{"\ntest", "tes", "test!"}, - }, { + }, + { pattern: "(aaa.?|bbb.?)", expectedZeroOrOneMatchers: 2, expectedMatches: []string{"aaa", "aaaX", "bbb", "bbbX"}, expectedNotMatches: []string{"aa", "aaaXX", "aaa\n", "bb", "bbbXX", "bbb\n"}, - }, { + }, + { pattern: ".*aaa.?", expectedZeroOrOneMatchers: 1, expectedMatches: []string{"aaa", "Xaaa", "aaaX", "XXXaaa", "XXXaaaX"},