Skip to content

Commit 4a930e9

Browse files
committed
some more refactoring to make the code more readable and less error prone
1 parent 9036c7a commit 4a930e9

File tree

4 files changed

+41
-35
lines changed

4 files changed

+41
-35
lines changed

src/json_repair/json_parser.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -133,23 +133,29 @@ def get_char_at(self, count: int = 0) -> str | None:
133133
except IndexError:
134134
return None
135135

136-
def skip_whitespaces_at(self, idx: int = 0, move_main_index: bool = True) -> int:
136+
def skip_whitespaces(self) -> None:
137137
"""
138-
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
138+
This function quickly iterates on whitespaces, moving the self.index forward
139139
"""
140140
try:
141-
char = self.json_str[self.index + idx]
142-
except IndexError:
143-
return idx
144-
while char.isspace():
145-
if move_main_index:
141+
char = self.json_str[self.index]
142+
while char.isspace():
146143
self.index += 1
147-
else:
144+
char = self.json_str[self.index]
145+
except IndexError:
146+
pass
147+
148+
def scroll_whitespaces(self, idx: int = 0) -> int:
149+
"""
150+
This function quickly iterates on whitespaces. Doesn't move the self.index and returns the offset from self.index
151+
"""
152+
try:
153+
char = self.json_str[self.index + idx]
154+
while char.isspace():
148155
idx += 1
149-
try:
150156
char = self.json_str[self.index + idx]
151-
except IndexError:
152-
return idx
157+
except IndexError:
158+
pass
153159
return idx
154160

155161
def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:

src/json_repair/parse_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
1515
# Stop when you either find the closing bracket or you have iterated over the entire string
1616
char = self.get_char_at()
1717
while char and char not in ["]", "}"]:
18-
self.skip_whitespaces_at()
18+
self.skip_whitespaces()
1919
value: JSONReturnType = ""
2020
if char in STRING_DELIMITERS:
2121
# Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
2222
# So we are going to check if this string is followed by a : or not
2323
# And either parse the string or parse the object
2424
i = 1
2525
i = self.skip_to_character(char, i)
26-
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
26+
i = self.scroll_whitespaces(idx=i + 1)
2727
value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
2828
else:
2929
value = self.parse_json()

src/json_repair/parse_object.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
1717
# <member> ::= <string> ': ' <json>
1818

1919
# Skip filler whitespaces
20-
self.skip_whitespaces_at()
20+
self.skip_whitespaces()
2121

2222
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
2323
if self.get_char_at() == ":":
@@ -53,14 +53,14 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
5353
prev_value.extend(
5454
new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
5555
)
56-
self.skip_whitespaces_at()
56+
self.skip_whitespaces()
5757
if self.get_char_at() == ",":
5858
self.index += 1
59-
self.skip_whitespaces_at()
59+
self.skip_whitespaces()
6060
continue
6161
key = str(self.parse_string())
6262
if key == "":
63-
self.skip_whitespaces_at()
63+
self.skip_whitespaces()
6464
if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
6565
# If the string is empty but there is an object divider, we are done here
6666
break
@@ -74,13 +74,13 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
7474
break
7575

7676
# Skip filler whitespaces
77-
self.skip_whitespaces_at()
77+
self.skip_whitespaces()
7878

7979
# We reached the end here
8080
if (self.get_char_at() or "}") == "}":
8181
continue
8282

83-
self.skip_whitespaces_at()
83+
self.skip_whitespaces()
8484

8585
# An extreme case of missing ":" after a key
8686
if self.get_char_at() != ":":
@@ -92,7 +92,7 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
9292
self.context.reset()
9393
self.context.set(ContextValues.OBJECT_VALUE)
9494
# The value can be any valid json
95-
self.skip_whitespaces_at()
95+
self.skip_whitespaces()
9696
# Corner case, a lone comma
9797
value: JSONReturnType = ""
9898
if self.get_char_at() in [",", "}"]:
@@ -110,7 +110,7 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
110110
self.index += 1
111111

112112
# Remove trailing spaces
113-
self.skip_whitespaces_at()
113+
self.skip_whitespaces()
114114

115115
self.index += 1
116116

@@ -126,11 +126,11 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
126126
if not self.context.empty:
127127
return obj
128128

129-
self.skip_whitespaces_at()
129+
self.skip_whitespaces()
130130
if self.get_char_at() != ",":
131131
return obj
132132
self.index += 1
133-
self.skip_whitespaces_at()
133+
self.skip_whitespaces()
134134
if self.get_char_at() not in STRING_DELIMITERS:
135135
return obj
136136
self.log(

src/json_repair/parse_string.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
9393
self.index += 1
9494
else:
9595
# Ok this is not a doubled quote, check if this is an empty string or not
96-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
96+
i = self.scroll_whitespaces(idx=1)
9797
next_c = self.get_char_at(i)
9898
if next_c in STRING_DELIMITERS + ["{", "["]:
9999
# something fishy is going on here
@@ -143,7 +143,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
143143
):
144144
rstring_delimiter_missing = True
145145
# check if this is a case in which the closing comma is NOT missing instead
146-
self.skip_whitespaces_at()
146+
self.skip_whitespaces()
147147
if self.get_char_at(1) == "\\":
148148
# Ok this is a quoted string, skip
149149
rstring_delimiter_missing = False
@@ -153,7 +153,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
153153
i += 1
154154
# found a delimiter, now we need to check that it is followed strictly by a comma or brace
155155
# or the string ended
156-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
156+
i = self.scroll_whitespaces(idx=i)
157157
next_c = self.get_char_at(i)
158158
if not next_c or next_c in [",", "}"]:
159159
rstring_delimiter_missing = False
@@ -168,7 +168,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
168168
else:
169169
# But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
170170
# Check if we find a : afterwards (skipping space)
171-
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
171+
i = self.scroll_whitespaces(idx=i + 1)
172172
next_c = self.get_char_at(i)
173173
if next_c and next_c != ":":
174174
rstring_delimiter_missing = False
@@ -183,7 +183,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
183183
break
184184
else:
185185
# skip any whitespace first
186-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
186+
i = self.scroll_whitespaces(idx=1)
187187
# We couldn't find any rstring_delimeter before the end of the string
188188
# check if this is the last string of an object and therefore we can keep going
189189
# make an exception if this is the last char before the closing brace
@@ -220,7 +220,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
220220
if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
221221
# We found the end of an object while parsing a value
222222
# Check if the object is really over, to avoid doubling the closing brace
223-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
223+
i = self.scroll_whitespaces(idx=1)
224224
next_c = self.get_char_at(i)
225225
if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
226226
# This could be a special case in which the LLM added code fences after the object
@@ -286,7 +286,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
286286
# found a second delimiter
287287
i += 1
288288
# Skip spaces
289-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
289+
i = self.scroll_whitespaces(idx=i)
290290
if self.get_char_at(i) in [",", "}"]:
291291
# Ok then this is a missing right quote
292292
self.log(
@@ -319,7 +319,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
319319
# We found a quote, now let's make sure there's a ":" following
320320
i += 1
321321
# found a delimiter, now we need to check that it is followed (after optional whitespace) by a colon
322-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
322+
i = self.scroll_whitespaces(idx=i)
323323
if self.get_char_at(i) == ":":
324324
# Reset the cursor
325325
self.index -= 1
@@ -365,7 +365,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
365365
next_c = self.get_char_at(i)
366366
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
367367
i += 1
368-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
368+
i = self.scroll_whitespaces(idx=i)
369369
next_c = self.get_char_at(i)
370370
if next_c in ["}", ","]:
371371
self.log(
@@ -378,15 +378,15 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
378378
if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
379379
break
380380
if self.context.current == ContextValues.OBJECT_VALUE:
381-
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
381+
i = self.scroll_whitespaces(idx=i + 1)
382382
if self.get_char_at(i) == ",":
383383
# So we found a comma, this could be a case of a single quote like "va"lue",
384384
# Search if it's followed by another key, starting with the first delimiter
385385
i = self.skip_to_character(character=lstring_delimiter, idx=i + 1)
386386
i += 1
387387
i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
388388
i += 1
389-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
389+
i = self.scroll_whitespaces(idx=i)
390390
next_c = self.get_char_at(i)
391391
if next_c == ":":
392392
self.log(
@@ -449,7 +449,7 @@ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str |
449449
self.log(
450450
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
451451
)
452-
self.skip_whitespaces_at()
452+
self.skip_whitespaces()
453453
if self.get_char_at() not in [":", ","]:
454454
return ""
455455

0 commit comments

Comments
 (0)