
Commit adf0ff4

n-peugnet authored and bzick committed
Update documentation to match the current code
readme.md:
- Update examples, fixing some arguments, and stop using deprecated methods.
- Add blank lines between successive code blocks; this is usually what markdown parsers expect.
- Fix a typo in a method name.
- Explain that numbers and underscores are not part of keywords by default.
- Remove some whitespace at the end of lines.

tokenizer.go:
- Add blank lines before "Deprecated:" doc comments so that Go tools can mark the methods as deprecated in the docs and in editors.
- Fix the description of DefineStringToken().
1 parent c98a149 · commit adf0ff4
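One behavioral detail this commit makes the readme state explicitly: digits and underscores are not keyword characters by default and must be opted in. A minimal sketch of that opt-in, assembled from the `AllowKeywordSymbols` call that appears in the readme diff below (the rune-set names `tokenizer.Underscore` and `tokenizer.Numbers` are taken from this diff, not verified against any particular release):

```go
package main

import (
	"fmt"

	"github.com/bzick/tokenizer"
)

func main() {
	parser := tokenizer.New()
	// By default only letters form keywords. Per the doc comments in the
	// tokenizer.go diff, major symbols may appear anywhere in a keyword
	// (underscore), while minor symbols may not start one (digits).
	parser.AllowKeywordSymbols(tokenizer.Underscore, tokenizer.Numbers)

	stream := parser.ParseString("user_id r2d2")
	defer stream.Close()
	for stream.IsValid() {
		// With the call above, "user_id" and "r2d2" each stay one keyword token.
		fmt.Println(stream.CurrentToken().ValueString())
		stream.GoNext()
	}
}
```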

2 files changed: 41 additions, 26 deletions


readme.md: 35 additions, 24 deletions

````diff
@@ -32,29 +32,33 @@ Use cases:
 For example, parsing SQL `WHERE` condition `user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`:
 
 ```go
+import "github.com/bzick/tokenizer"
+
 // define custom tokens keys
-const (
-    TEquality = 1
-    TDot = 2
-    TMath = 3
+const (
+    TEquality = iota + 1
+    TDot
+    TMath
+    TDoubleQuoted
 )
 
 // configure tokenizer
 parser := tokenizer.New()
 parser.DefineTokens(TEquality, []string{"<", "<=", "==", ">=", ">", "!="})
 parser.DefineTokens(TDot, []string{"."})
 parser.DefineTokens(TMath, []string{"+", "-", "/", "*", "%"})
-parser.DefineStringToken(`"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
+parser.DefineStringToken(TDoubleQuoted, `"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
+parser.AllowKeywordSymbols(tokenizer.Underscore, tokenizer.Numbers)
 
 // create tokens' stream
 stream := parser.ParseString(`user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`)
 defer stream.Close()
 
 // iterate over each token
-for stream.Valid() {
+for stream.IsValid() {
     if stream.CurrentToken().Is(tokenizer.TokenKeyword) {
-        field := stream.NextToken().ValueString()
-        // ...
+        field := stream.CurrentToken().ValueString()
+        // ...
     }
     stream.GoNext()
 }
@@ -68,15 +72,15 @@ tokens: |user_id| =| 119| and| modified| >| "2020-01-01 00:00:00"| or| amount| >
 
 0: {key: TokenKeyword, value: "user_id"} token.Value() == "user_id"
 1: {key: TEquality, value: "="} token.Value() == "="
-2: {key: TokenInteger, value: "119"} token.ValueInt() == 119
+2: {key: TokenInteger, value: "119"} token.ValueInt64() == 119
 3: {key: TokenKeyword, value: "and"} token.Value() == "and"
 4: {key: TokenKeyword, value: "modified"} token.Value() == "modified"
 5: {key: TEquality, value: ">"} token.Value() == ">"
 6: {key: TokenString, value: "\"2020-01-01 00:00:00\""} token.ValueUnescaped() == "2020-01-01 00:00:00"
 7: {key: TokenKeyword, value: "or"} token.Value() == "and"
 8: {key: TokenKeyword, value: "amount"} token.Value() == "amount"
 9: {key: TEquality, value: ">="} token.Value() == ">="
-10: {key: TokenFloat, value: "122.34"} token.ValueFloat() == 122.34
+10: {key: TokenFloat, value: "122.34"} token.ValueFloat64() == 122.34
 ```
 
 More examples:
@@ -87,12 +91,11 @@ More examples:
 ### Create and parse
 
 ```go
-import (
-    "github.com/bzick/tokenizer"
-)
+import "github.com/bzick/tokenizer"
 
 var parser := tokenizer.New()
-parser.AllowKeywordUnderscore() // ... and other configuration code
+parser.AllowKeywordSymbols(tokenizer.Underscore, []rune{})
+// ... and other configuration code
 
 ```
 
@@ -110,20 +113,20 @@ fp, err := os.Open("data.json") // huge JSON file
 
 stream := parser.ParseStream(fp, 4096).SetHistorySize(10)
 defer stream.Close()
-for stream.IsValid() {
+for stream.IsValid() {
     // ...
     stream.GoNext()
 }
 ```
 
 ## Embedded tokens
 
-- `tokenizer.TokenUnknown` — unspecified token key.
+- `tokenizer.TokenUnknown` — unspecified token key.
 - `tokenizer.TokenKeyword` — keyword, any combination of letters, including unicode letters.
 - `tokenizer.TokenInteger` — integer value
 - `tokenizer.TokenFloat` — float/double value
 - `tokenizer.TokenString` — quoted string
-- `tokenizer.TokenStringFragment` — fragment framed (quoted) string
+- `tokenizer.TokenStringFragment` — fragment framed (quoted) string
 
 ### Unknown token
 
@@ -132,6 +135,7 @@ A token marks as `tokenizer.TokenUnknown` if the parser detects an unknown token
 ```go
 stream := parser.ParseString(`one!`)
 ```
+
 ```
 stream: [
     {
@@ -151,6 +155,7 @@ Setting `tokenizer.StopOnUndefinedToken()` stops parser when `tokenizer.TokenUn
 ```go
 stream := parser.ParseString(`one!`)
 ```
+
 ```
 stream: [
     {
@@ -168,11 +173,12 @@ and the length of the original string.
 
 Any word that is not a custom token is stored in a single token as `tokenizer.TokenKeyword`.
 
-The word can contain unicode characters, numbers (see `tokenizer.AllowNumbersInKeyword()`) and underscore (see `tokenizer.AllowKeywordUnderscore ()`).
+The word can contain unicode characters, and it can be configured to contain other characters, like numbers and underscores (see `tokenizer.AllowKeywordSymbols()`).
 
 ```go
 stream := parser.ParseString(`one 二 три`)
 ```
+
 ```
 stream: [
     {
@@ -210,6 +216,7 @@ Any integer is stored as one token with key `tokenizer.TokenInteger`.
 ```go
 stream := parser.ParseString(`223 999`)
 ```
+
 ```
 stream: [
     {
@@ -223,11 +230,11 @@ stream: [
 ]
 ```
 
-To get int64 from the token value use `stream.GetInt()`:
+To get int64 from the token value use `stream.GetInt64()`:
 
 ```go
 stream := tokenizer.ParseString("123")
-fmt.Print("Token is %d", stream.CurrentToken().GetInt()) // Token is 123
+fmt.Print("Token is %d", stream.CurrentToken().GetInt64()) // Token is 123
 ```
 
 ### Float number
@@ -241,6 +248,7 @@ Any float number is stored as one token with key `tokenizer.TokenFloat`. Float n
 ```go
 stream := parser.ParseString(`1.3e-8`):
 ```
+
 ```
 stream: [
     {
@@ -250,11 +258,11 @@ stream: [
 ]
 ```
 
-To get float64 from the token value use `token.GetFloat()`:
+To get float64 from the token value use `token.GetFloat64()`:
 
 ```go
 stream := parser.ParseString("1.3e2")
-fmt.Print("Token is %d", stream.CurrentToken().GetFloat()) // Token is 130
+fmt.Print("Token is %d", stream.CurrentToken().GetFloat64()) // Token is 130
 ```
 
 ### Framed string
@@ -271,6 +279,7 @@ parser.DefineStringToken(TokenDoubleQuotedString, `"`, `"`).SetEscapeSymbol('\\'
 // ...
 stream := parser.ParseString(`"two \"three"`)
 ```
+
 ```
 stream: [
     {
@@ -280,10 +289,10 @@ stream: [
 ]
 ```
 
-To get a framed string without edge tokens and special characters, use the `stream.ValueUnescape()` method:
+To get a framed string without edge tokens and special characters, use the `stream.ValueUnescaped()` method:
 
 ```go
-value := stream.CurrentToken().ValueUnescape() // result: two "three
+value := stream.CurrentToken().ValueUnescaped() // result: two "three
 ```
 
 The method `token.StringKey()` will be return token string key defined in the `DefineStringToken`:
@@ -313,7 +322,9 @@ parser.DefineStringToken(TokenQuotedString, `"`, `"`).AddInjection(TokenOpenInje
 
 parser.ParseString(`"one {{ two }} three"`)
 ```
+
 Tokens:
+
 ```
 {
     {
````
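Stitching the `+` lines of the first hunk together, the updated readme example reads as a complete program roughly like this; the `package main`/`func main` wrapper and the `fmt` printing are additions of mine, since the readme snippet shows only a fragment:

```go
package main

import (
	"fmt"

	"github.com/bzick/tokenizer"
)

// custom token keys, starting from 1 as in the updated snippet
const (
	TEquality = iota + 1
	TDot
	TMath
	TDoubleQuoted
)

func main() {
	parser := tokenizer.New()
	parser.DefineTokens(TEquality, []string{"<", "<=", "==", ">=", ">", "!="})
	parser.DefineTokens(TDot, []string{"."})
	parser.DefineTokens(TMath, []string{"+", "-", "/", "*", "%"})
	// the string token now takes its key as the first argument
	parser.DefineStringToken(TDoubleQuoted, `"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
	parser.AllowKeywordSymbols(tokenizer.Underscore, tokenizer.Numbers)

	stream := parser.ParseString(`user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`)
	defer stream.Close()

	for stream.IsValid() {
		if stream.CurrentToken().Is(tokenizer.TokenKeyword) {
			fmt.Println("keyword:", stream.CurrentToken().ValueString())
		}
		stream.GoNext()
	}
}
```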

tokenizer.go: 6 additions, 2 deletions

````diff
@@ -40,6 +40,7 @@ const BackSlash = '\\'
 var DefaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}
 
 // DefaultStringEscapes is default escaped symbols. Those symbols are often used everywhere.
+//
 // Deprecated: use DefaultSpecialString and AddSpecialStrings
 var DefaultStringEscapes = map[byte]byte{
     'n': '\n',
@@ -102,6 +103,7 @@ func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
 }
 
 // SetSpecialSymbols set mapping of all escapable symbols for escape symbol, like \n, \t, \r.
+//
 // Deprecated: use AddSpecialStrings
 func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
     for _, v := range special {
@@ -171,6 +173,7 @@ func (t *Tokenizer) AllowKeywordSymbols(majorSymbols []rune, minorSymbols []rune
 }
 
 // AllowKeywordUnderscore allows underscore symbol in keywords, like `one_two` or `_three`
+//
 // Deprecated: use AllowKeywordSymbols
 func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
     t.kwMajorSymbols = append(t.kwMajorSymbols, '_')
@@ -180,6 +183,7 @@ func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
 // AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`
 // The method allows numbers in keywords, but the keyword itself must not start with a number.
 // There should be no spaces between letters and numbers.
+//
 // Deprecated: use AllowKeywordSymbols
 func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
     t.kwMinorSymbols = append(t.kwMinorSymbols, Numbers...)
@@ -230,10 +234,10 @@ func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
 // For example, a piece of data surrounded by quotes: "string in quotes" or 'string on single quotes'.
 // Arguments startToken and endToken defines open and close "quotes".
 //
-// - `t.DefineStringToken("`", "`")` - parse string "one `two three`" will be parsed as
+// - `t.DefineStringToken(10, "`", "`")` - parse string "one `two three`" will be parsed as
 //   [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
 //
-// - `t.DefineStringToken("//", "\n")` - parse string "parse // like comment\n" will be parsed as
+// - `t.DefineStringToken(11, "//", "\n")` - parse string "parse // like comment\n" will be parsed as
 //   [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
 func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
     q := &StringSettings{
````
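For context on the `+//` lines above: Go tooling (gopls, staticcheck, pkg.go.dev) only treats an identifier as deprecated when `Deprecated:` begins its own paragraph of the doc comment, and in a doc comment a paragraph break is a bare `//` line, which is exactly what the commit inserts. A sketch of the convention, using hypothetical functions:

```go
package example

// OldHelper does something useful the old way.
//
// Deprecated: use NewHelper instead. Without the bare "//" line above,
// this sentence would belong to the first paragraph and tools would not
// flag OldHelper as deprecated.
func OldHelper() { NewHelper() }

// NewHelper is a hypothetical replacement, named here only to illustrate
// the convention the commit applies to AllowKeywordUnderscore,
// AllowNumbersInKeyword, SetSpecialSymbols, and DefaultStringEscapes.
func NewHelper() {}
```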
