Skip to content

Commit

Permalink
Optimize .? in the FastRegexMatcher
Browse files Browse the repository at this point in the history
Signed-off-by: Marco Pracucci <[email protected]>
  • Loading branch information
pracucci committed Sep 29, 2023
1 parent 5ae9c19 commit 756e124
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 2 deletions.
33 changes: 31 additions & 2 deletions model/labels/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,15 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {

// Any string is fine (including an empty one), as far as it doesn't contain any newline.
return anyStringWithoutNewlineMatcher{}
case syntax.OpQuest:
// Only optimize for ".?".
if len(re.Sub) != 1 || re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
return nil
}

return &zeroOrOneCharacterStringMatcher{
matchNL: re.Sub[0].Op == syntax.OpAnyChar,
}
case syntax.OpEmptyMatch:
return emptyStringMatcher{}

Expand Down Expand Up @@ -511,14 +520,14 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
var left, right StringMatcher

// Let's try to find if there's a first and last any matchers.
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar || re.Sub[0].Op == syntax.OpQuest {
left = stringMatcherFromRegexpInternal(re.Sub[0])
if left == nil {
return nil
}
re.Sub = re.Sub[1:]
}
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar || re.Sub[len(re.Sub)-1].Op == syntax.OpQuest {
right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
if right == nil {
return nil
Expand Down Expand Up @@ -845,6 +854,26 @@ func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
return len(s) > 0 && strings.IndexByte(s, '\n') == -1
}

// zeroOrOneCharacterStringMatcher is a StringMatcher which matches zero or one occurrence
// of any character. The newline character is matches only if matchNL is set to true.
type zeroOrOneCharacterStringMatcher struct {
matchNL bool
}

func (m *zeroOrOneCharacterStringMatcher) Matches(s string) bool {
// Zero or one.
if len(s) > 1 {
return false
}

// No need to check for the newline if the string is empty or matching a newline is OK.
if m.matchNL || len(s) == 0 {
return true
}

return s[0] != '\n'
}

// trueMatcher is a stringMatcher which matches any string (always returns true).
type trueMatcher struct{}

Expand Down
112 changes: 112 additions & 0 deletions model/labels/regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ var (
"(?i:(zQPbMkNO.*|NNSPdvMi.*|iWuuSoAl.*|qbvKMimS.*|IecrXtPa.*|seTckYqt.*|NxnyHkgB.*|fIDlOgKb.*|UhlWIygH.*|OtNoJxHG.*|cUTkFVIV.*|mTgFIHjr.*|jQkoIDtE.*|PPMKxRXl.*|AwMfwVkQ.*|CQyMrTQJ.*|BzrqxVSi.*|nTpcWuhF.*|PertdywG.*|ZZDgCtXN.*|WWdDPyyE.*|uVtNQsKk.*|BdeCHvPZ.*|wshRnFlH.*|aOUIitIp.*|RxZeCdXT.*|CFZMslCj.*|AVBZRDxl.*|IzIGCnhw.*|ythYuWiz.*|oztXVXhl.*|VbLkwqQx.*|qvaUgyVC.*|VawUjPWC.*|ecloYJuj.*|boCLTdSU.*|uPrKeAZx.*|hrMWLWBq.*|JOnUNHRM.*|rYnujkPq.*|dDEdZhIj.*|DRrfvugG.*|yEGfDxVV.*|YMYdJWuP.*|PHUQZNWM.*|AmKNrLis.*|zTxndVfn.*|FPsHoJnc.*|EIulZTua.*|KlAPhdzg.*|ScHJJCLt.*|NtTfMzME.*|eMCwuFdo.*|SEpJVJbR.*|cdhXZeCx.*|sAVtBwRh.*|kVFEVcMI.*|jzJrxraA.*|tGLHTell.*|NNWoeSaw.*|DcOKSetX.*|UXZAJyka.*|THpMphDP.*|rizheevl.*|kDCBRidd.*|pCZZRqyu.*|pSygkitl.*|SwZGkAaW.*|wILOrfNX.*|QkwVOerj.*|kHOMxPDr.*|EwOVycJv.*|AJvtzQFS.*|yEOjKYYB.*|LizIINLL.*|JBRSsfcG.*|YPiUqqNl.*|IsdEbvee.*|MjEpGcBm.*|OxXZVgEQ.*|xClXGuxa.*|UzRCGFEb.*|buJbvfvA.*|IPZQxRet.*|oFYShsMc.*|oBHffuHO.*|bzzKrcBR.*|KAjzrGCl.*|IPUsAVls.*|OGMUMbIU.*|gyDccHuR.*|bjlalnDd.*|ZLWjeMna.*|fdsuIlxQ.*|dVXtiomV.*|XxedTjNg.*|XWMHlNoA.*|nnyqArQX.*|opfkWGhb.*|wYtnhdYb.*))",
// A long case insensitive alternation where each entry starts with ".*".
"(?i:(.*zQPbMkNO|.*NNSPdvMi|.*iWuuSoAl|.*qbvKMimS|.*IecrXtPa|.*seTckYqt|.*NxnyHkgB|.*fIDlOgKb|.*UhlWIygH|.*OtNoJxHG|.*cUTkFVIV|.*mTgFIHjr|.*jQkoIDtE|.*PPMKxRXl|.*AwMfwVkQ|.*CQyMrTQJ|.*BzrqxVSi|.*nTpcWuhF|.*PertdywG|.*ZZDgCtXN|.*WWdDPyyE|.*uVtNQsKk|.*BdeCHvPZ|.*wshRnFlH|.*aOUIitIp|.*RxZeCdXT|.*CFZMslCj|.*AVBZRDxl|.*IzIGCnhw|.*ythYuWiz|.*oztXVXhl|.*VbLkwqQx|.*qvaUgyVC|.*VawUjPWC|.*ecloYJuj|.*boCLTdSU|.*uPrKeAZx|.*hrMWLWBq|.*JOnUNHRM|.*rYnujkPq|.*dDEdZhIj|.*DRrfvugG|.*yEGfDxVV|.*YMYdJWuP|.*PHUQZNWM|.*AmKNrLis|.*zTxndVfn|.*FPsHoJnc|.*EIulZTua|.*KlAPhdzg|.*ScHJJCLt|.*NtTfMzME|.*eMCwuFdo|.*SEpJVJbR|.*cdhXZeCx|.*sAVtBwRh|.*kVFEVcMI|.*jzJrxraA|.*tGLHTell|.*NNWoeSaw|.*DcOKSetX|.*UXZAJyka|.*THpMphDP|.*rizheevl|.*kDCBRidd|.*pCZZRqyu|.*pSygkitl|.*SwZGkAaW|.*wILOrfNX|.*QkwVOerj|.*kHOMxPDr|.*EwOVycJv|.*AJvtzQFS|.*yEOjKYYB|.*LizIINLL|.*JBRSsfcG|.*YPiUqqNl|.*IsdEbvee|.*MjEpGcBm|.*OxXZVgEQ|.*xClXGuxa|.*UzRCGFEb|.*buJbvfvA|.*IPZQxRet|.*oFYShsMc|.*oBHffuHO|.*bzzKrcBR|.*KAjzrGCl|.*IPUsAVls|.*OGMUMbIU|.*gyDccHuR|.*bjlalnDd|.*ZLWjeMna|.*fdsuIlxQ|.*dVXtiomV|.*XxedTjNg|.*XWMHlNoA|.*nnyqArQX|.*opfkWGhb|.*wYtnhdYb))",
// Quest ".?".
"fo.?",
"foo.?",
"f.?o",
".*foo.?",
".?foo.+",
"foo.?|bar",
}
values = []string{
"foo", " foo bar", "bar", "buzz\nbar", "bar foo", "bfoo", "\n", "\nfoo", "foo\n", "hello foo world", "hello foo\n world", "",
Expand Down Expand Up @@ -418,6 +425,13 @@ func TestStringMatcherFromRegexp(t *testing.T) {
{"foo.+.+", nil},
{".*.*foo", nil},
{".+.+foo", nil},
{"aaa.?.?", nil},
{"aaa.?.*", nil},
// Regexps with ".?".
{"ext.?|xfs", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"(?s)(ext.?|xfs)", orStringMatcher{&literalPrefixStringMatcher{prefix: "ext", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: true}}, &equalStringMatcher{s: "xfs", caseSensitive: true}}},
{"foo.?", &literalPrefixStringMatcher{prefix: "foo", prefixCaseSensitive: true, right: &zeroOrOneCharacterStringMatcher{matchNL: false}}},
{"f.?o", nil},
} {
c := c
t.Run(c.pattern, func(t *testing.T) {
Expand Down Expand Up @@ -588,6 +602,88 @@ func TestStringMatcherFromRegexp_LiteralSuffix(t *testing.T) {
}
}

func TestStringMatcherFromRegexp_Quest(t *testing.T) {
for _, c := range []struct {
pattern string
expectedZeroOrOneMatchers int
expectedMatches []string
expectedNotMatches []string
}{
// Not match newline.
{
pattern: "test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!"},
expectedNotMatches: []string{"test\n", "tes", "test!!"},
}, {

Check failure on line 618 in model/labels/regexp_test.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed with `-extra` (gofumpt)
pattern: ".?test",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "!test"},
expectedNotMatches: []string{"\ntest", "tes", "test!"},
}, {

Check failure on line 623 in model/labels/regexp_test.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed with `-extra` (gofumpt)
pattern: "(aaa.?|bbb.?)",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbb", "bbbX"},
expectedNotMatches: []string{"aa", "aaaXX", "aaa\n", "bb", "bbbXX", "bbb\n"},
}, {

Check failure on line 628 in model/labels/regexp_test.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed with `-extra` (gofumpt)
pattern: ".*aaa.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"aaa", "Xaaa", "aaaX", "XXXaaa", "XXXaaaX"},
expectedNotMatches: []string{"aa", "aaaXX", "XXXaaaXXX", "XXXaaa\n"},
},

// Match newline.
{
pattern: "(?s)test.?",
expectedZeroOrOneMatchers: 1,
expectedMatches: []string{"test", "test!", "test\n"},
expectedNotMatches: []string{"tes", "test!!", "test\n\n"},
},

// Mixed flags (a part matches newline another doesn't).
{
pattern: "(aaa.?|((?s).?bbb.+))",
expectedZeroOrOneMatchers: 2,
expectedMatches: []string{"aaa", "aaaX", "bbbX", "XbbbX", "bbbXXX", "\nbbbX"},
expectedNotMatches: []string{"aa", "aaa\n", "Xbbb", "\nbbb"},
},
} {
t.Run(c.pattern, func(t *testing.T) {
parsed, err := syntax.Parse(c.pattern, syntax.Perl)
require.NoError(t, err)

matcher := stringMatcherFromRegexp(parsed)
require.NotNil(t, matcher)

re := regexp.MustCompile("^" + c.pattern + "$")

// Pre-condition check: ensure it contains zeroOrOneCharacterStringMatcher.
numZeroOrOneMatchers := 0
visitStringMatcher(matcher, func(matcher StringMatcher) {
if _, ok := matcher.(*zeroOrOneCharacterStringMatcher); ok {
numZeroOrOneMatchers++
}
})

require.Equal(t, c.expectedZeroOrOneMatchers, numZeroOrOneMatchers)

for _, value := range c.expectedMatches {
assert.Truef(t, matcher.Matches(value), "Value: %s", value)

// Ensure the golang regexp engine would return the same.
assert.Truef(t, re.MatchString(value), "Value: %s", value)
}

for _, value := range c.expectedNotMatches {
assert.Falsef(t, matcher.Matches(value), "Value: %s", value)

// Ensure the golang regexp engine would return the same.
assert.Falsef(t, re.MatchString(value), "Value: %s", value)
}
})
}
}

func randString(randGenerator *rand.Rand, length int) string {
b := make([]rune, length)
for i := range b {
Expand Down Expand Up @@ -1106,6 +1202,22 @@ func BenchmarkOptimizeEqualStringMatchers(b *testing.B) {
}
}

func TestZeroOrOneCharacterStringMatcher(t *testing.T) {
matcher := &zeroOrOneCharacterStringMatcher{matchNL: true}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.True(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))

matcher = &zeroOrOneCharacterStringMatcher{matchNL: false}
assert.True(t, matcher.Matches(""))
assert.True(t, matcher.Matches("x"))
assert.False(t, matcher.Matches("\n"))
assert.False(t, matcher.Matches("xx"))
assert.False(t, matcher.Matches("\n\n"))
}

func TestLiteralPrefixStringMatcher(t *testing.T) {
m := &literalPrefixStringMatcher{prefix: "mar", prefixCaseSensitive: true, right: &emptyStringMatcher{}}
assert.True(t, m.Matches("mar"))
Expand Down

0 comments on commit 756e124

Please sign in to comment.