@@ -119,7 +119,7 @@ class PreTokenizerTests: XCTestCase {
119119 )
120120 XCTAssertEqual (
121121 preTokenizer1. preTokenize ( text: " Hey, friend, what's up? " ) ,
122- [ " " , " " , " " , " Hey, " , " " , " " , " " , " " , " friend, " , " " , " " , " " , " " , " what's " , " " , " up? " , " " , " " , " " ]
122+ [ " " , " " , " " , " Hey, " , " " , " " , " " , " " , " friend, " , " " , " " , " " , " " , " what's " , " " , " up? " , " " , " " ]
123123 )
124124
125125 let preTokenizer2 = SplitPreTokenizer ( config: Config ( [ " pattern " : [ " Regex " : " \\ s " ] ] ) )
@@ -133,21 +133,22 @@ class PreTokenizerTests: XCTestCase {
133133 )
134134 XCTAssertEqual (
135135 preTokenizer2. preTokenize ( text: " Hey, friend, what's up? " ) ,
136- [ " " , " " , " " , " Hey, " , " " , " " , " " , " " , " friend, " , " " , " " , " " , " " , " what's " , " " , " up? " , " " , " " , " " ]
136+ [ " " , " " , " " , " Hey, " , " " , " " , " " , " " , " friend, " , " " , " " , " " , " " , " what's " , " " , " up? " , " " , " " ]
137137 )
138138
139- let preTokenizer3 = SplitPreTokenizer ( config: Config ( [ " pattern " : [ " Regex " : " \\ s " ] , " invert " : true ] ) )
139+ let preTokenizer3 = SplitPreTokenizer ( config: Config ( [ " pattern " : [ " Regex " : " (?i: \' s| \' t| \' re| \' ve| \' m| \' ll| \' d)|[^ \\ r \\ n \\ p{L} \\ p{N}]? \\ p{L}+| \\ p{N}{1,3}| ?[^ \\ s \\ p{L} \\ p{N}]+[ \\ r \\ n]*| \\ s*[ \\ r \\ n]+| \\ s+(?! \\ S)| \\ s+ " ] , " invert " : true ] ) )
140140 XCTAssertEqual (
141- preTokenizer3. preTokenize ( text: " Hey friend! " ) ,
142- [ " Hey " , " friend! " ]
141+ preTokenizer3. preTokenize ( text: " Hello " ) ,
142+ [ " Hello " ]
143143 )
144+
144145 XCTAssertEqual (
145- preTokenizer3. preTokenize ( text: " Hey friend! How are you?!? " ) ,
146- [ " Hey " , " friend! " , " How " , " are " , " you?!? " ]
146+ preTokenizer3. preTokenize ( text: " Hey friend! " ) ,
147+ [ " Hey " , " friend" , " ! " ]
147148 )
148149 XCTAssertEqual (
149- preTokenizer3. preTokenize ( text: " Hey, friend, what's up? " ) ,
150- [ " Hey, " , " friend, " , " what's " , " up? " , " " ]
150+ preTokenizer3. preTokenize ( text: " Hey friend! How are you?!? " ) ,
151+ [ " Hey " , " friend" , " ! " , " " , " How " , " are " , " you " , " ?!? " ]
151152 )
152153 }
153154
0 commit comments