Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize .? in the FastRegexMatcher #537

Merged
merged 3 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions model/labels/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,15 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {

// Any string is fine (including an empty one), as far as it doesn't contain any newline.
return anyStringWithoutNewlineMatcher{}
case syntax.OpQuest:
// Only optimize for ".?".
if len(re.Sub) != 1 || (re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL) {
return nil
}

return &zeroOrOneCharacterStringMatcher{
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
}
case syntax.OpEmptyMatch:
return emptyStringMatcher{}

Expand Down Expand Up @@ -511,14 +520,14 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
var left, right StringMatcher

// Let's try to find if there's a first and last any matchers.
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest {
left = stringMatcherFromRegexpInternal(re.Sub[0])
if left == nil {
return nil
}
re.Sub = re.Sub[1:]
}
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest {
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
if right == nil {
return nil
Expand Down Expand Up @@ -845,6 +854,26 @@ func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
return len(s) > 0 && strings.IndexByte(s, '\n') == -1
}

// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence
// of any character. The newline character is matches only if matchNL is set to true.
type zeroOrOneCharacterStringMatcher struct {
matchNL bool
}

func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool {
// Zero or one.
if len(s) > 1 {
return false
}

// No need to check for the newline if the string is empty or matching a newline is OK.
if m.matchNL || len(s) == 0 {
return true
}

return s[0] != '\n'
}

// trueMatcher is a stringMatcher which matches any string (always returns true).
type trueMatcher struct{}

Expand Down
115 changes: 115 additions & 0 deletions model/labels/regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ var (
"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))",
// A long case insensitive alternation where each entry starts with ".*".
"(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))",
// Quest ".?".
"fo.?",
"foo.?",
"f.?o",
".*foo.?",
".?foo.+",
"foo.?|bar",
}
values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
Expand Down Expand Up @@ -418,6 +425,13 @@ func TestStringMatcherFromRegexp(t *testing.T) {
{"foo.+.+", nil},
{".*.*foo", nil},
{".+.+foo", nil},
{"aaa.?.?", nil},
{"aaa.?.*", nil},
// Regexps with ".?".
{"ext.?|xfs", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"(?s)(ext.?|xfs)", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: true}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"foo.?", &literalPrefixStringMatcher{prefix: "foo", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}},
{"f.?o", nil},
} {
c := c
t.Run(c.pattern, func(t *testing.T) {
Expand Down Expand Up @@ -588,6 +602,91 @@ func TestStringMatcherFromRegexp_LiteralSuffix(t *testing.T) {
}
}

func TestStringMatcherFromRegexp_Quest(t *testing.T) {
for _, c := range []struct {
pattern string
expectedZeroOrOneMatchers int
expectedMatches []string
expectedNotMatches []string
}{
// Not match newline.
{
pattern: "test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!"},
expectedNotMatches: []string{"test\n", "tes", "test!!"},
},
{
pattern: ".?test",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "!test"},
expectedNotMatches: []string{"\ntest", "tes", "test!"},
},
{
pattern: "(aaa.?|bbb.?)",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbb", "bbbX"},
expectedNotMatches: []string{"aa", "aaaXX", "aaa\n", "bb", "bbbXX", "bbb\n"},
},
{
pattern: ".*aaa.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"aaa", "Xaaa", "aaaX", "XXXaaa", "XXXaaaX"},
expectedNotMatches: []string{"aa", "aaaXX", "XXXaaaXXX", "XXXaaa\n"},
},

// Match newline.
{
pattern: "(?s)test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!", "test\n"},
expectedNotMatches: []string{"tes", "test!!", "test\n\n"},
},

// Mixed flags (a part matches newline another doesn't).
{
pattern: "(aaa.?|((?s).?bbb.+))",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbbX", "XbbbX", "bbbXXX", "\nbbbX"},
expectedNotMatches: []string{"aa", "aaa\n", "Xbbb", "\nbbb"},
},
} {
t.Run(c.pattern, func(t *testing.T) {
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
require.NoError(t, err)

matcher := stringMatcherFromRegexp(parsed)
require.NotNil(t, matcher)

re := regexp.MustCompile("^" + c.pattern + "$")

// Pre-condition check: ensure it contains zeroOrOneCharacterStringMatcher.
numZeroOrOneMatchers := 0
visitStringMatcher(matcher, func(matcher StringMatcher) {
if _, ok := matcher.(*zeroOrOneCharacterStringMatcher); ok {
numZeroOrOneMatchers++
}
})

require.Equal(t, c.expectedZeroOrOneMatchers, numZeroOrOneMatchers)

for _, value := range c.expectedMatches {
assert.Truef(t, matcher.Matches(value), "Value: %s", value)

// Ensure the golang regexp engine would return the same.
assert.Truef(t, re.MatchString(value), "Value: %s", value)
}

for _, value := range c.expectedNotMatches {
assert.Falsef(t, matcher.Matches(value), "Value: %s", value)

// Ensure the golang regexp engine would return the same.
assert.Falsef(t, re.MatchString(value), "Value: %s", value)
}
})
}
}

func randString(randGenerator *rand.Rand, length int) string {
b := make([]rune, length)
for i := range b {
Expand Down Expand Up @@ -1106,6 +1205,22 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
}
}

func TestZeroOrOneCharacterStringMatcher(t *testing.T) {
matcher := &zeroOrOneCharacterStringMatcher{matchNL: true}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.True(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))

matcher = &zeroOrOneCharacterStringMatcher{matchNL: false}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.False(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))
}

func TestLiteralPrefixStringMatcher(t *testing.T) {
m := &literalPrefixStringMatcher{prefix: "mar", prefixCaseSensitive: true, right: &emptyStringMatcher{}}
assert.True(t, m.Matches("mar"))
Expand Down
Loading