-
-
Notifications
You must be signed in to change notification settings - Fork 297
add dataset support #361
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add dataset support #361
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,53 @@ | ||||||
| // Copyright 2022 Juan Pablo Tosso and the OWASP Coraza contributors | ||||||
| // SPDX-License-Identifier: Apache-2.0 | ||||||
|
|
||||||
| package operators | ||||||
|
|
||||||
| import ( | ||||||
| "fmt" | ||||||
|
|
||||||
| "github.com/corazawaf/coraza/v3" | ||||||
| ahocorasick "github.com/petar-dambovaliev/aho-corasick" | ||||||
| ) | ||||||
|
|
||||||
| // TODO according to coraza researchs, re2 matching is faster than ahocorasick | ||||||
| // maybe we should switch in the future | ||||||
| // pmFromDataset is always lowercase | ||||||
| type pmFromDataset struct { | ||||||
| matcher ahocorasick.AhoCorasick | ||||||
| } | ||||||
|
|
||||||
| func (o *pmFromDataset) Init(options coraza.RuleOperatorOptions) error { | ||||||
| data := options.Arguments | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this going to be single value always?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could accept multiple datasets if required, but I don't see much value |
||||||
| dataset, ok := options.Datasets[data] | ||||||
| if !ok { | ||||||
| return fmt.Errorf("Dataset %q not found", data) | ||||||
| } | ||||||
| builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{ | ||||||
| AsciiCaseInsensitive: true, | ||||||
| MatchOnlyWholeWords: false, | ||||||
| MatchKind: ahocorasick.LeftMostLongestMatch, | ||||||
| DFA: true, | ||||||
| }) | ||||||
|
|
||||||
| // TODO this operator is supposed to support snort data syntax: "@pmFromDataset A|42|C|44|F" | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I think this TODO is misleading, snort syntax would end up being inside the Dataset, wouldn't it?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, we should keep it only for PM. Regarding duplications, it depends on the operator, the dictionary will store everything case-sensitive. The aho-corasick implementation will read it case-insensitive and might probably remove duplicates. Still, a duplicate shouldn't affect results, just performance. |
||||||
| o.matcher = builder.Build(dataset) | ||||||
| return nil | ||||||
| } | ||||||
|
|
||||||
| func (o *pmFromDataset) Evaluate(tx *coraza.Transaction, value string) bool { | ||||||
| if tx.Capture { | ||||||
| matches := o.matcher.FindAll(value) | ||||||
| for i, match := range matches { | ||||||
| if i == 10 { | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This value seems arbitrary (and kind of repetitive). Maybe we want to abstract it, at least in this package?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it's part of our technical debt, we should add the constant somewhere. 10 is the documented standard though. |
||||||
| return true | ||||||
| } | ||||||
| tx.CaptureField(i, value[match.Start():match.End()]) | ||||||
| } | ||||||
| return len(matches) > 0 | ||||||
| } | ||||||
| iter := o.matcher.Iter(value) | ||||||
| return iter.Next() != nil | ||||||
| } | ||||||
|
|
||||||
| var _ coraza.RuleOperator = (*pmFromDataset)(nil) | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| // Copyright 2022 Juan Pablo Tosso and the OWASP Coraza contributors | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| package operators | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "testing" | ||
|
|
||
| "github.com/corazawaf/coraza/v3" | ||
| ) | ||
|
|
||
| func TestPmFromDataset(t *testing.T) { | ||
| pm := &pmFromDataset{} | ||
| opts := coraza.RuleOperatorOptions{ | ||
| Arguments: "test_1", | ||
| Datasets: map[string][]string{ | ||
| "test_1": {"test_1", "test_2"}, | ||
| }, | ||
| } | ||
|
|
||
| if err := pm.Init(opts); err != nil { | ||
| t.Error(err) | ||
| } | ||
| waf := coraza.NewWaf() | ||
| tx := waf.NewTransaction(context.Background()) | ||
| tx.Capture = true | ||
| res := pm.Evaluate(tx, "test_1") | ||
| if !res { | ||
| t.Error("pmFromDataset failed") | ||
| } | ||
| opts.Datasets = map[string][]string{} | ||
| if err := pm.Init(opts); err == nil { | ||
| t.Error(fmt.Errorf("pmFromDataset should have failed")) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,10 +18,11 @@ import ( | |
|
|
||
| // DirectiveOptions contains the parsed options for a directive | ||
| type DirectiveOptions struct { | ||
| Waf *coraza.Waf | ||
| Config types.Config | ||
| Opts string | ||
| Path []string | ||
| Waf *coraza.Waf | ||
| Config types.Config | ||
| Opts string | ||
| Path []string | ||
| Datasets map[string][]string | ||
| } | ||
|
|
||
| type directive = func(options *DirectiveOptions) error | ||
|
|
@@ -470,6 +471,28 @@ func directiveSecIgnoreRuleCompilationErrors(options *DirectiveOptions) error { | |
| return nil | ||
| } | ||
|
|
||
| func directiveSecDataset(options *DirectiveOptions) error { | ||
| spl := strings.SplitN(options.Opts, " ", 2) | ||
| if len(spl) != 2 { | ||
| return errors.New("syntax error: SecDataset name `\n...\n`") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The error is a bit unclear to me. What does
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will be printed as: It is hard to represent the syntax as a single line. |
||
| } | ||
| name := spl[0] | ||
| if _, ok := options.Datasets[name]; ok { | ||
| options.Waf.Logger.Warn("Dataset %s already exists, overwriting", name) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather use
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, we should normalize using %q for external inputs. We should document it as technical debt |
||
| } | ||
| arr := []string{} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this good enough @anuraaga? Is it worth to work out as a best effort a capacity or since this only happens on bootstrap it is OK to use with no length.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we should care much about optimizations for the seclang package and it is already way faster than modsecurity and it doesn't happen that often. Still, we should track a seclang optimization in the future. |
||
| data := strings.Trim(spl[1], "`") | ||
| for _, s := range strings.Split(data, "\n") { | ||
| s = strings.TrimSpace(s) | ||
| if s == "" || s[0] == '#' { | ||
| continue | ||
| } | ||
| arr = append(arr, s) | ||
| } | ||
| options.Datasets[name] = arr | ||
| return nil | ||
| } | ||
|
|
||
| func newCompileRuleError(err error, opts string) error { | ||
| return fmt.Errorf("failed to compile rule (%s): %s", err, opts) | ||
| } | ||
|
|
@@ -573,6 +596,7 @@ var directivesMap = map[string]directive{ | |
| "secauditlogfilemode": directiveSecAuditLogFileMode, | ||
| "secauditlogdirmode": directiveSecAuditLogDirMode, | ||
| "secignorerulecompilationerrors": directiveSecIgnoreRuleCompilationErrors, | ||
| "secdataset": directiveSecDataset, | ||
|
|
||
| // Unsupported Directives | ||
| "secargumentseparator": directiveUnsupported, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,3 +181,19 @@ func TestInvalidRulesWithIgnoredErrors(t *testing.T) { | |
| t.Error("failed to error on invalid rule") | ||
| } | ||
| } | ||
|
|
||
| func TestSecDataset(t *testing.T) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to include more tests here. I guess the char "`" isn't supported in the dataset.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "`" is supported in the dataset, the code only interprets it if it is the last character of the directive declaration or the only character in the line. |
||
| waf := coraza.NewWaf() | ||
| p, _ := NewParser(waf) | ||
| if err := p.FromString("" + | ||
| "SecDataset test `\n123\n456\n`\n"); err != nil { | ||
| t.Error(err) | ||
| } | ||
| ds := p.options.Datasets["test"] | ||
| if len(ds) != 2 { | ||
| t.Errorf("failed to add dataset, got %d records", len(ds)) | ||
| } | ||
| if ds[0] != "123" || ds[1] != "456" { | ||
| t.Error("failed to add dataset") | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,19 +76,29 @@ func (p *Parser) FromString(data string) error { | |
| scanner := bufio.NewScanner(strings.NewReader(data)) | ||
| var linebuffer = "" | ||
| pattern := regexp.MustCompile(`\\(\s+)?$`) | ||
| inQuotes := false | ||
| for scanner.Scan() { | ||
| p.currentLine++ | ||
| line := scanner.Text() | ||
| linebuffer += strings.TrimSpace(line) | ||
| line := strings.TrimSpace(scanner.Text()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change requires a unit test.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CRS tests are passing, but we should definitely add more tests for this. It is a major change for the seclang parser. |
||
| if !inQuotes && len(line) > 0 && line[len(line)-1] == '`' { | ||
| inQuotes = true | ||
| } else if inQuotes && len(line) > 0 && line[0] == '`' { | ||
| inQuotes = false | ||
| } | ||
| if inQuotes { | ||
| linebuffer += line + "\n" | ||
| } else { | ||
| linebuffer += line | ||
| } | ||
|
|
||
| // Check if line ends with \ | ||
| match := pattern.MatchString(line) | ||
| if !match { | ||
| if !pattern.MatchString(line) && !inQuotes { | ||
| err := p.evaluate(linebuffer) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| linebuffer = "" | ||
| } else { | ||
| } else if !inQuotes { | ||
| linebuffer = strings.TrimSuffix(linebuffer, "\\") | ||
| } | ||
| } | ||
|
|
@@ -158,8 +168,9 @@ func NewParser(waf *coraza.Waf) (*Parser, error) { | |
| } | ||
| p := &Parser{ | ||
| options: &DirectiveOptions{ | ||
| Waf: waf, | ||
| Config: make(types.Config), | ||
| Waf: waf, | ||
| Config: make(types.Config), | ||
| Datasets: make(map[string][]string), | ||
| }, | ||
| } | ||
| return p, nil | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shall we link the benchmarks or show results?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we should remove this comment, it comes from v1