From 07e7dfa688e3b8927939f44413b16df09364f375 Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Mon, 10 May 2021 16:31:33 +0430 Subject: [PATCH 1/5] Add support for Perl(PCRE) named and unnamed group capturing order. In other words maintain the order of capture groups. --- README.md | 1 + regexp.go | 1 + regexp_MaintainCaptureOrder_test.go | 366 ++++++++++++++++++++++++++++ syntax/parser.go | 159 +++++++----- 4 files changed, 471 insertions(+), 56 deletions(-) create mode 100644 regexp_MaintainCaptureOrder_test.go diff --git a/README.md b/README.md index f92f8b1..9e448f4 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ The internals of `regexp2` always operate on `[]rune` so `Index` and `Length` da | named back reference `\k'name'` | no | yes | | named ascii character class `[[:foo:]]`| yes | no (yes in RE2 compat mode) | | conditionals `(?(expr)yes\|no)` | no | yes | +| PCRE capture group order | no | no (yes in MaintainCaptureOrder mode) | ## RE2 compatibility mode The default behavior of `regexp2` is to match the .NET regexp engine, however the `RE2` option is provided to change the parsing to increase compatibility with RE2. 
Using the `RE2` option when compiling a regexp will not take away any features, but will change the following behaviors: diff --git a/regexp.go b/regexp.go index 7c7b01d..60fce4a 100644 --- a/regexp.go +++ b/regexp.go @@ -121,6 +121,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 (regexp package) compatibility mode + MaintainCaptureOrder = 0x1000 // Maintain named and unnamed capture order ) func (re *Regexp) RightToLeft() bool { diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go new file mode 100644 index 0000000..99f97ca --- /dev/null +++ b/regexp_MaintainCaptureOrder_test.go @@ -0,0 +1,366 @@ +package regexp2 + +import ( + "testing" +) + +func TestMaintainCaptureOrder_Basic(t *testing.T) { + r, err := Compile("(?this).+?(testing).+?(?stuff)", MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := `this is a testing stuff` + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `2`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, 
got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, string(m.GroupByNumber(3).Runes()); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { + r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a \ntesting stuff" + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `2`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { + r, err := Compile("(?sio)(?this).+?\n(testing).+?(?stuff)", 0) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + 
} + text := "This is a \ntesting stuff" + m, err := r.FindStringMatch(text) + // t.Errorf(" groups: %#v\n", m) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `2`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_NestedCaptures(t *testing.T) { + r, err := Compile( + `(?This)(?(.)+?(?testing)).+?(some.+?(other).+?(?stuff)) (?\k)`, MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a testing some other stuff testing" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, 
groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ` is a testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `second`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := groups[2].String(), groups[2].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ` `, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `a`, groups[3].Captures[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `3`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `test`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `some other stuff`, groups[5].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `5`, groups[5].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `other`, groups[6].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `6`, groups[6].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[7].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[7].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := 8, len(groups); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_RE2_And_NumBackref(t *testing.T) { + r, err := Compile( + `(?'first'This).+?(?Ptesting) 
(some).+?(?<4>stuff) \2`, MaintainCaptureOrder | RE2) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a testing some other stuff testing" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `test`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `some`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `3`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `4`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_Balancing_Conditional_Alternation(t *testing.T) { + r, err := Compile( + `^[^<>]*(((?'Open'<)[^<>]*)+((?'Close-Open'>)[^<>]*)+)*(?(Open)(?!))$`, MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := ">" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + 
t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ``, groups[1].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[1].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `1`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ``, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `4`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `mno`, groups[5].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `abc`, groups[5].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `xyz`, groups[5].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `mno`, groups[5].Captures[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `Close`, groups[5].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} diff --git a/syntax/parser.go b/syntax/parser.go index 
d86f332..2a9ebc4 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -22,6 +22,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 compat mode + MaintainCaptureOrder = 0x1000 // "o" Maintain named and unnamed capture order ) func optionFromCode(ch rune) RegexOptions { @@ -43,6 +44,8 @@ func optionFromCode(ch rune) RegexOptions { return Debug case 'e', 'E': return ECMAScript + case 'o', 'O': + return MaintainCaptureOrder default: return 0 } @@ -129,8 +132,9 @@ type parser struct { captop int capsize int - caps map[int]int - capnames map[string]int + caps map[int]int + capnames map[string]int + capnamenums map[string]int capnumlist []int capnamelist []string @@ -214,6 +218,17 @@ func (p *parser) noteCaptureName(name string, pos int) { p.capnames = make(map[string]int) } + if p.useMaintainCaptureOrder() { + if p.capnamenums == nil { + p.capnamenums = make(map[string]int) + } + + if _, ok := p.capnamenums[name]; !ok { + p.capnamenums[name] = p.autocap + p.noteCaptureSlot(p.consumeAutocap(), pos) + } + } + if _, ok := p.capnames[name]; !ok { p.capnames[name] = pos p.capnamelist = append(p.capnamelist, name) @@ -221,73 +236,81 @@ func (p *parser) noteCaptureName(name string, pos int) { } func (p *parser) assignNameSlots() { - if p.capnames != nil { - for _, name := range p.capnamelist { - for p.isCaptureSlot(p.autocap) { + if p.useMaintainCaptureOrder() { + p.capnames = p.capnamenums + // Prepend `0` to capnamelist if it's not set + if p.capnamelist[0] != `0` { + p.capnamelist = append([]string{fmt.Sprint(0)}, p.capnamelist...) 
+ } + } else { + if p.capnames != nil { + for _, name := range p.capnamelist { + for p.isCaptureSlot(p.autocap) { + p.autocap++ + } + pos := p.capnames[name] + p.capnames[name] = p.autocap + p.noteCaptureSlot(p.autocap, pos) + p.autocap++ } - pos := p.capnames[name] - p.capnames[name] = p.autocap - p.noteCaptureSlot(p.autocap, pos) - - p.autocap++ } - } + + // if the caps array has at least one gap, construct the list of used slots + if p.capcount < p.captop { + p.capnumlist = make([]int, p.capcount) + i := 0 - // if the caps array has at least one gap, construct the list of used slots - if p.capcount < p.captop { - p.capnumlist = make([]int, p.capcount) - i := 0 + for k := range p.caps { + p.capnumlist[i] = k + i++ + } - for k := range p.caps { - p.capnumlist[i] = k - i++ + sort.Ints(p.capnumlist) } - sort.Ints(p.capnumlist) - } + // merge capsnumlist into capnamelist + if p.capnames != nil || p.capnumlist != nil { + var oldcapnamelist []string + var next int + var k int - // merge capsnumlist into capnamelist - if p.capnames != nil || p.capnumlist != nil { - var oldcapnamelist []string - var next int - var k int + if p.capnames == nil { + oldcapnamelist = nil + p.capnames = make(map[string]int) + p.capnamelist = []string{} + next = -1 + } else { + oldcapnamelist = p.capnamelist + p.capnamelist = []string{} + next = p.capnames[oldcapnamelist[0]] + } - if p.capnames == nil { - oldcapnamelist = nil - p.capnames = make(map[string]int) - p.capnamelist = []string{} - next = -1 - } else { - oldcapnamelist = p.capnamelist - p.capnamelist = []string{} - next = p.capnames[oldcapnamelist[0]] - } + for i := 0; i < p.capcount; i++ { + j := i + if p.capnumlist != nil { + j = p.capnumlist[i] + } - for i := 0; i < p.capcount; i++ { - j := i - if p.capnumlist != nil { - j = p.capnumlist[i] - } + if next == j { + p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) + k++ - if next == j { - p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) - k++ + if k == 
len(oldcapnamelist) { + next = -1 + } else { + next = p.capnames[oldcapnamelist[k]] + } - if k == len(oldcapnamelist) { - next = -1 } else { - next = p.capnames[oldcapnamelist[k]] + //feature: culture? + str := strconv.Itoa(j) + p.capnamelist = append(p.capnamelist, str) +p.capnames[str] = j + } } - - } else { - //feature: culture? - str := strconv.Itoa(j) - p.capnamelist = append(p.capnamelist, str) - p.capnames[str] = j } } - } } func (p *parser) consumeAutocap() int { @@ -301,7 +324,11 @@ func (p *parser) consumeAutocap() int { func (p *parser) countCaptures() error { var ch rune - p.noteCaptureSlot(0, 0) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(0), 0) + } else { + p.noteCaptureSlot(0, 0) + } p.autocap = 1 @@ -350,7 +377,11 @@ func (p *parser) countCaptures() error { if err != nil { return err } - p.noteCaptureSlot(dec, pos) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(dec), pos) + } else { + p.noteCaptureSlot(dec, pos) + } } else { p.noteCaptureName(p.scanCapname(), pos) } @@ -386,7 +417,11 @@ func (p *parser) countCaptures() error { } } else { if !p.useOptionN() && !p.ignoreNextParen { - p.noteCaptureSlot(p.consumeAutocap(), pos) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(p.autocap), pos) + } else { + p.noteCaptureSlot(p.consumeAutocap(), pos) + } } } } @@ -921,6 +956,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { return nil, p.getErr(ErrInvalidGroupName) } + + if capnum != -1 && p.useMaintainCaptureOrder() { + p.consumeAutocap() + } } else if ch == '-' { proceed = true } else { @@ -1062,6 +1101,9 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { // actually make the node if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { + if p.useMaintainCaptureOrder() { + p.consumeAutocap() + } return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil } goto BreakRecognize @@ -1968,6 
+2010,11 @@ func (p *parser) useRE2() bool { return (p.options & RE2) != 0 } +// true to use MaintainCaptureOrder parsing behavior. +func (p *parser) useMaintainCaptureOrder() bool { + return (p.options & MaintainCaptureOrder) != 0 +} + // True if options stack is empty. func (p *parser) emptyOptionsStack() bool { return len(p.optionsStack) == 0 From 197dfc9f59054a28e0b9bf0249ca24f95c7f0dcb Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Wed, 12 May 2021 11:34:00 +0430 Subject: [PATCH 2/5] Check the length of `p.capnamelist` when assigning slots --- syntax/parser.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/syntax/parser.go b/syntax/parser.go index 2a9ebc4..cf07420 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -238,8 +238,8 @@ func (p *parser) noteCaptureName(name string, pos int) { func (p *parser) assignNameSlots() { if p.useMaintainCaptureOrder() { p.capnames = p.capnamenums - // Prepend `0` to capnamelist if it's not set - if p.capnamelist[0] != `0` { + // Prepend `0` to capnamelist if it's not set (MaintainCaptureOrder was enabled inline) + if len(p.capnamelist) == 0 || p.capnamelist[0] != `0` { p.capnamelist = append([]string{fmt.Sprint(0)}, p.capnamelist...) 
} } else { From 896189e9c2fb62ba14fadd1381321599afe43507 Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Wed, 12 May 2021 12:00:08 +0430 Subject: [PATCH 3/5] Add test for when no capture group is used with inline option --- regexp_MaintainCaptureOrder_test.go | 30 ++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go index 99f97ca..071850b 100644 --- a/regexp_MaintainCaptureOrder_test.go +++ b/regexp_MaintainCaptureOrder_test.go @@ -106,7 +106,6 @@ func TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { } text := "This is a \ntesting stuff" m, err := r.FindStringMatch(text) - // t.Errorf(" groups: %#v\n", m) if err != nil { t.Errorf("unexpected match err: %v", err) } @@ -143,6 +142,35 @@ func TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { } } +func TestMaintainCaptureOrder_Inline_No_Capture_Groups(t *testing.T) { + r, err := Compile("(?o)this.+?testing.+?stuff", 0) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := `this is a testing stuff` + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := 1, len(groups); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + func TestMaintainCaptureOrder_NestedCaptures(t *testing.T) { r, err := Compile( `(?This)(?(.)+?(?testing)).+?(some.+?(other).+?(?stuff)) (?\k)`, MaintainCaptureOrder) From 690cfad098ba5154fc48f2635a3b29e7a6300c24 Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Thu, 13 May 2021 
11:50:47 +0430 Subject: [PATCH 4/5] Applied requested changes - Document MaintainCaptureOrder option - Use return in `assignNameSlots` and remove else - Add test with MaintainCaptureOrder not provided - Change the MaintainCaptureOrder value to `0x0400` - Remove the `o` inline option - Add comment to explain why `autocap` is consumed --- README.md | 14 ++++ regexp.go | 2 +- regexp_MaintainCaptureOrder_test.go | 34 ++++++--- syntax/parser.go | 111 ++++++++++++++-------------- 4 files changed, 94 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 9e448f4..c8166a4 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,20 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch { This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?). +## MaintainCaptureOrder mode +The default behavior of `regexp2` is to match the .NET regexp engine, which unlike PCRE, doesn't maintain the order of the captures and appends the named capture groups to the end of captured groups. Using the `MaintainCaptureOrder` option when compiling a regexp will keep the order of named and unnamed capture groups. 
+ +```go +re := regexp2.MustCompile(`(?<first>This) (is) a (?<last>test)`, regexp2.MaintainCaptureOrder) +if match, _ := re.FindStringMatch(`This is a test`); match != nil { + // match.Groups()[1].String() == "This" + // match.Groups()[1].Name == "first" + // match.Groups()[2].String() == "is" + // match.Groups()[2].Name == "2" + // match.Groups()[3].String() == "test" + // match.Groups()[3].Name == "last" +} +``` ## Library features that I'm still working on - Regex split diff --git a/regexp.go b/regexp.go index 60fce4a..5179468 100644 --- a/regexp.go +++ b/regexp.go @@ -121,7 +121,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 (regexp package) compatibility mode - MaintainCaptureOrder = 0x1000 // Maintain named and unnamed capture order + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func (re *Regexp) RightToLeft() bool { diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go index 071850b..4b708fe 100644 --- a/regexp_MaintainCaptureOrder_test.go +++ b/regexp_MaintainCaptureOrder_test.go @@ -37,6 +37,9 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } + if want, got := `first`, m.regex.GroupNameFromNumber(1); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } if want, got := `testing`, groups[2].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } @@ -54,13 +57,13 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { } } -func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { - r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) +func TestMaintainCaptureOrder_Mode_Not_Enabled(t *testing.T) { + r, err := Compile("(?this).+?(testing).+?(?stuff)", 0) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) } - text := "This is a \ntesting stuff" + text := 
`this is a testing stuff` m, err := r.FindStringMatch(text) if err != nil { t.Errorf("unexpected match err: %v", err) @@ -78,16 +81,22 @@ func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { if want, got := text, groups[0].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `This`, groups[1].String(); want != got { + if want, got := `testing`, groups[1].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `first`, groups[1].Name; want != got { + if want, got := `1`, groups[1].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `testing`, groups[2].String(); want != got { + if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `2`, groups[2].Name; want != got { + if want, got := `first`, m.regex.GroupNameFromNumber(2); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[2].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } if want, got := `stuff`, groups[3].String(); want != got { @@ -96,10 +105,13 @@ func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { if want, got := `last`, groups[3].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } + if want, got := `stuff`, string(m.GroupByNumber(3).Runes()); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } } -func TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { - r, err := Compile("(?sio)(?this).+?\n(testing).+?(?stuff)", 0) +func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { + r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) @@ -142,8 +154,8 @@ func 
TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { } } -func TestMaintainCaptureOrder_Inline_No_Capture_Groups(t *testing.T) { - r, err := Compile("(?o)this.+?testing.+?stuff", 0) +func TestMaintainCaptureOrder_No_Capture_Groups(t *testing.T) { + r, err := Compile("this.+?testing.+?stuff", MaintainCaptureOrder) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) diff --git a/syntax/parser.go b/syntax/parser.go index cf07420..d75ee90 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -22,7 +22,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 compat mode - MaintainCaptureOrder = 0x1000 // "o" Maintain named and unnamed capture order + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func optionFromCode(ch rune) RegexOptions { @@ -44,8 +44,6 @@ func optionFromCode(ch rune) RegexOptions { return Debug case 'e', 'E': return ECMAScript - case 'o', 'O': - return MaintainCaptureOrder default: return 0 } @@ -242,75 +240,76 @@ func (p *parser) assignNameSlots() { if len(p.capnamelist) == 0 || p.capnamelist[0] != `0` { p.capnamelist = append([]string{fmt.Sprint(0)}, p.capnamelist...) 
} - } else { - if p.capnames != nil { - for _, name := range p.capnamelist { - for p.isCaptureSlot(p.autocap) { - p.autocap++ - } - pos := p.capnames[name] - p.capnames[name] = p.autocap - p.noteCaptureSlot(p.autocap, pos) + return + } + if p.capnames != nil { + for _, name := range p.capnamelist { + for p.isCaptureSlot(p.autocap) { p.autocap++ } + pos := p.capnames[name] + p.capnames[name] = p.autocap + p.noteCaptureSlot(p.autocap, pos) + + p.autocap++ } - - // if the caps array has at least one gap, construct the list of used slots - if p.capcount < p.captop { - p.capnumlist = make([]int, p.capcount) - i := 0 + } - for k := range p.caps { - p.capnumlist[i] = k - i++ - } + // if the caps array has at least one gap, construct the list of used slots + if p.capcount < p.captop { + p.capnumlist = make([]int, p.capcount) + i := 0 - sort.Ints(p.capnumlist) + for k := range p.caps { + p.capnumlist[i] = k + i++ } - // merge capsnumlist into capnamelist - if p.capnames != nil || p.capnumlist != nil { - var oldcapnamelist []string - var next int - var k int + sort.Ints(p.capnumlist) + } - if p.capnames == nil { - oldcapnamelist = nil - p.capnames = make(map[string]int) - p.capnamelist = []string{} - next = -1 - } else { - oldcapnamelist = p.capnamelist - p.capnamelist = []string{} - next = p.capnames[oldcapnamelist[0]] - } + // merge capsnumlist into capnamelist + if p.capnames != nil || p.capnumlist != nil { + var oldcapnamelist []string + var next int + var k int - for i := 0; i < p.capcount; i++ { - j := i - if p.capnumlist != nil { - j = p.capnumlist[i] - } + if p.capnames == nil { + oldcapnamelist = nil + p.capnames = make(map[string]int) + p.capnamelist = []string{} + next = -1 + } else { + oldcapnamelist = p.capnamelist + p.capnamelist = []string{} + next = p.capnames[oldcapnamelist[0]] + } - if next == j { - p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) - k++ + for i := 0; i < p.capcount; i++ { + j := i + if p.capnumlist != nil { + j = p.capnumlist[i] + } 
- if k == len(oldcapnamelist) { - next = -1 - } else { - next = p.capnames[oldcapnamelist[k]] - } + if next == j { + p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) + k++ + if k == len(oldcapnamelist) { + next = -1 } else { - //feature: culture? - str := strconv.Itoa(j) - p.capnamelist = append(p.capnamelist, str) -p.capnames[str] = j - } + next = p.capnames[oldcapnamelist[k]] } + + } else { + //feature: culture? + str := strconv.Itoa(j) + p.capnamelist = append(p.capnamelist, str) + p.capnames[str] = j } } + } } func (p *parser) consumeAutocap() int { @@ -958,6 +957,8 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { } if capnum != -1 && p.useMaintainCaptureOrder() { + // Successfully scanned a named capture group so we need to increment + // our cap number to maintain the order p.consumeAutocap() } } else if ch == '-' { From 65c2a99b86e65803ed6432911fd22b2b5c8b2edd Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Fri, 16 Jul 2021 14:36:22 +0430 Subject: [PATCH 5/5] Use the `String` method instead of converting runes to string --- regexp_MaintainCaptureOrder_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go index 4b708fe..bde5f4c 100644 --- a/regexp_MaintainCaptureOrder_test.go +++ b/regexp_MaintainCaptureOrder_test.go @@ -34,7 +34,7 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { if want, got := `first`, groups[1].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { + if want, got := `this`, m.GroupByName(`first`).String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } if want, got := `first`, m.regex.GroupNameFromNumber(1); want != got { @@ -52,7 +52,7 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { if want, got := `last`, groups[3].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got 
:= `stuff`, string(m.GroupByNumber(3).Runes()); want != got { + if want, got := `stuff`, m.GroupByNumber(3).String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } } @@ -87,7 +87,7 @@ func TestMaintainCaptureOrder_Mode_Not_Enabled(t *testing.T) { if want, got := `1`, groups[1].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { + if want, got := `this`, m.GroupByName(`first`).String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } if want, got := `first`, m.regex.GroupNameFromNumber(2); want != got { @@ -105,7 +105,7 @@ func TestMaintainCaptureOrder_Mode_Not_Enabled(t *testing.T) { if want, got := `last`, groups[3].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `stuff`, string(m.GroupByNumber(3).Runes()); want != got { + if want, got := `stuff`, m.GroupByNumber(3).String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } }