@@ -1,7 +1,9 @@
 package tokenizer
 
 import (
+	"bytes"
 	"github.com/stretchr/testify/require"
+	"strings"
 	"testing"
 )
 
@@ -151,6 +153,8 @@ func TestTokenizeEdgeCases(t *testing.T) {
 			{key: TokenKeyword, value: s2b("E"), offset: 1, line: 1, id: 1},
 			{key: TokenUnknown, value: s2b("+"), offset: 2, line: 1, id: 2},
 		}},
+		{"\x00", []Token{ // https://github.com/bzick/tokenizer/issues/28
+		}},
 	}
 	for _, v := range data1 {
 		t.Run(v.str, func(t *testing.T) {
@@ -347,3 +351,58 @@ func TestTokenizeInject(t *testing.T) {
 		},
 	}, stream.GetSnippet(10, 10), "parsed %s as %s", str, stream)
 }
+
+func FuzzStream(f *testing.F) {
+	testcases := []string{
+		`{id: 1, key: "object number 1", value: 1.2E3}`,
+		"hello\n \n\tworld",
+		"test\x00",
+		"\x00",
+	}
+
+	for _, tc := range testcases {
+		f.Add(tc) // use f.Add to provide a seed corpus
+	}
+	f.Fuzz(func(t *testing.T, orig string) {
+
+		nullIndex := strings.IndexRune(orig, '\x00')
+		if nullIndex != -1 && nullIndex != len(orig)-1 {
+			t.Skipf("Skipping input with data after null byte")
+		}
+
+		origBytes := []byte(orig)
+		buffer := bytes.NewBuffer(origBytes)
+		tokenizer := New()
+		commaKey := TokenKey(10)
+		colonKey := TokenKey(11)
+		openKey := TokenKey(12)
+		closeKey := TokenKey(13)
+		dquoteKey := TokenKey(14)
+		tokenizer.DefineTokens(commaKey, []string{","})
+		tokenizer.DefineTokens(colonKey, []string{":"})
+		tokenizer.DefineTokens(openKey, []string{"{"})
+		tokenizer.DefineTokens(closeKey, []string{"}"})
+		tokenizer.DefineStringToken(dquoteKey, `"`, `"`).SetEscapeSymbol('\\')
+
+		stream := tokenizer.ParseStream(buffer, 100)
+		var actual []byte
+		for stream.IsValid() {
+			current := stream.CurrentToken()
+			// t.Logf("%#v", current)
+			actual = append(actual, current.Indent()...)
+			actual = append(actual, current.Value()...)
+			stream.GoNext()
+		}
+		// t.Logf("%#v", stream.CurrentToken())
+
+		// Because we only concatenate each token's indent and value, trailing
+		// whitespace and token separators are lost, so we trim these
+		// characters from the right of both the actual and expected slices.
+		trimset := ". \t\r\n\x00"
+		expected := bytes.TrimRight(origBytes, trimset)
+		actual = bytes.TrimRight(actual, trimset)
+		if !bytes.Equal(expected, actual) {
+			t.Errorf("input:\n%q\nexpected:\n%q\nactual:\n%q", orig, expected, actual)
+		}
+	})
+}
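
Usage note (not part of the diff above; these are standard Go 1.18+ fuzzing flags, and the package path is assumed to be the repository root): the new target can be run locally with the native Go fuzzer, for example

	go test -run=^$ -fuzz=FuzzStream -fuzztime=30s .

Any crashing inputs the fuzzer finds are written to testdata/fuzz/FuzzStream in the package directory and are replayed automatically on later plain go test runs.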