
Commit eaaba8c: Add more edge cases

1 parent 33bb86e

File tree: 3 files changed (+75, -6 lines)

readme.md

Lines changed: 1 addition & 1 deletion

@@ -376,7 +376,7 @@ stream := parser.ParseString(`{"key": [1]}`)
 
 ## Known issues
 
-* zero-byte `\0` ignores in the source string.
+* zero-byte `\x00` (`\0`) stops parsing.
 
 ## Benchmark

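A minimal sketch of the behaviour the updated readme note describes, assuming the public API that appears elsewhere in this commit (New, ParseString, IsValid, CurrentToken, GoNext); it is illustrative and not part of the change:

package main

import (
	"fmt"

	"github.com/bzick/tokenizer"
)

func main() {
	parser := tokenizer.New()
	// Parsing stops at the zero byte: only tokens before "\x00" are produced.
	stream := parser.ParseString("before\x00after")
	for stream.IsValid() {
		fmt.Printf("%s\n", stream.CurrentToken().Value()) // expected to print only "before"
		stream.GoNext()
	}
}
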
stream.go

Lines changed: 15 additions & 5 deletions

@@ -29,12 +29,19 @@ type Stream struct {
 	historySize int
 }
 
-// NewStream creates new parsed stream of tokens.
+func validateToken(t *Token) *Token {
+	if t != nil {
+		return t
+	}
+	return undefToken
+}
+
+// NewStream creates a new parsed stream of tokens.
 func NewStream(p *parsing) *Stream {
 	return &Stream{
 		t:       p.t,
-		head:    p.head,
-		current: p.head,
+		head:    validateToken(p.head),
+		current: validateToken(p.head),
 		len:     p.n,
 		wsTail:  p.tail,
 		parsed:  p.parsed + p.pos,
@@ -47,8 +54,8 @@ func NewInfStream(p *parsing) *Stream {
 		t:       p.t,
 		p:       p,
 		len:     p.n,
-		head:    p.head,
-		current: p.head,
+		head:    validateToken(p.head),
+		current: validateToken(p.head),
 	}
 }
 
@@ -289,6 +296,9 @@ func (s *Stream) GetSnippet(before, after int) []Token {
 		after = s.len - before - 1
 	}
 	segment = make([]Token, before+after+1)
+	if len(segment) == 0 {
+		return segment
+	}
 	var ptr *Token
 	if s.next != nil {
 		ptr = s.next

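The validateToken fallback matters when the parser produces no tokens at all, for example on empty input or input cut off immediately by a zero byte: p.head is then nil, and that nil would flow into Stream.head and Stream.current. A hypothetical illustration of the effect, not part of the commit, assuming ParseString and CurrentToken as shown elsewhere in this change:

// Hypothetical sketch only.
parser := tokenizer.New()
stream := parser.ParseString("\x00") // yields zero tokens
tok := stream.CurrentToken()
// Without the fallback, tok could be nil here; with validateToken it is the
// shared undefined token, so callers can inspect it without a nil check.
_ = tok
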
tokenizer_test.go

Lines changed: 59 additions & 0 deletions

@@ -1,7 +1,9 @@
 package tokenizer
 
 import (
+	"bytes"
 	"github.com/stretchr/testify/require"
+	"strings"
 	"testing"
 )
 
@@ -151,6 +153,8 @@ func TestTokenizeEdgeCases(t *testing.T) {
 		{key: TokenKeyword, value: s2b("E"), offset: 1, line: 1, id: 1},
 		{key: TokenUnknown, value: s2b("+"), offset: 2, line: 1, id: 2},
 	}},
+	{"\x00", []Token{ // https://github.com/bzick/tokenizer/issues/28
+	}},
 	}
 	for _, v := range data1 {
 		t.Run(v.str, func(t *testing.T) {
@@ -347,3 +351,58 @@ func TestTokenizeInject(t *testing.T) {
 		},
 	}, stream.GetSnippet(10, 10), "parsed %s as %s", str, stream)
 }
+
+func FuzzStream(f *testing.F) {
+	testcases := []string{
+		`{id: 1, key: "object number 1", value: 1.2E3}`,
+		"hello\n \n\tworld",
+		"test\x00",
+		"\x00",
+	}
+
+	for _, tc := range testcases {
+		f.Add(tc) // Use f.Add to provide a seed corpus
+	}
+	f.Fuzz(func(t *testing.T, orig string) {
+
+		nullIndex := strings.IndexRune(orig, '\x00')
+		if nullIndex != -1 && nullIndex != len(orig)-1 {
+			t.Skipf("Skipping input with data after null byte")
+		}
+
+		origBytes := []byte(orig)
+		buffer := bytes.NewBuffer(origBytes)
+		tokenizer := New()
+		commaKey := TokenKey(10)
+		colonKey := TokenKey(11)
+		openKey := TokenKey(12)
+		closeKey := TokenKey(13)
+		dquoteKey := TokenKey(14)
+		tokenizer.DefineTokens(commaKey, []string{","})
+		tokenizer.DefineTokens(colonKey, []string{":"})
+		tokenizer.DefineTokens(openKey, []string{"{"})
+		tokenizer.DefineTokens(closeKey, []string{"}"})
+		tokenizer.DefineStringToken(dquoteKey, `"`, `"`).SetEscapeSymbol('\\')
+
+		stream := tokenizer.ParseStream(buffer, 100)
+		var actual []byte
+		for stream.IsValid() {
+			current := stream.CurrentToken()
+			// t.Logf("%#v", current)
+			actual = append(actual, current.Indent()...)
+			actual = append(actual, current.Value()...)
+			stream.GoNext()
+		}
+		// t.Logf("%#v", stream.CurrentToken())
+
+		// As we only concatenate the indents of each token, the trailing
+		// whitespaces and token separators are lost, so we trim these
+		// characters on the right of both actual and expected slices.
+		trimset := ". \t\r\n\x00"
+		expected := bytes.TrimRight(origBytes, trimset)
+		actual = bytes.TrimRight(actual, trimset)
+		if !bytes.Equal(expected, actual) {
+			t.Errorf("input:\n%q\nexpected:\n%q\nactual:\n%q", orig, expected, actual)
+		}
+	})
+}

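The new target runs against its seed corpus as part of the normal test suite; to keep generating random inputs beyond the seeds, it can be invoked with the standard Go fuzzing flags, for example:

go test -run=FuzzStream -fuzz=FuzzStream -fuzztime=30s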