Skip to content

Commit bfc084d

Browse files
authored
fix: Handle various HTML encodings in documentation parsing (#58)
Fix encoding errors when parsing Home Manager and nix-darwin documentation, which may be served with different encodings (windows-1252, UTF-8 with BOM, ISO-8859-1).

Changes:
- Use resp.content instead of resp.text in parse_html_options()
- Let BeautifulSoup handle encoding detection automatically
- Add test coverage for windows-1252, UTF-8 with BOM, and ISO-8859-1
- Update all existing tests to use mock_resp.content

This resolves intermittent "unknown encoding: windows-1252" errors when fetching documentation from CDN edge servers with different configurations.

Fixes errors like:
- "Failed to fetch docs: unknown encoding: windows-1252"
1 parent 50b02bc commit bfc084d

File tree

8 files changed

+118
-30
lines changed

8 files changed

+118
-30
lines changed

mcp_nixos/server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,9 @@ def parse_html_options(url: str, query: str = "", prefix: str = "", limit: int =
249249
try:
250250
resp = requests.get(url, timeout=30) # Increase timeout for large docs
251251
resp.raise_for_status()
252-
soup = BeautifulSoup(resp.text, "html.parser")
252+
# Use resp.content to let BeautifulSoup handle encoding detection
253+
# This prevents encoding errors like "unknown encoding: windows-1252"
254+
soup = BeautifulSoup(resp.content, "html.parser")
253255
options = []
254256

255257
# Get all dt elements

tests/test_edge_cases.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def test_parse_html_options_large_document(self, mock_get):
108108

109109
mock_resp = Mock()
110110
mock_resp.raise_for_status = Mock()
111-
mock_resp.text = large_html
111+
mock_resp.content = large_html.encode("utf-8")
112112
mock_get.return_value = mock_resp
113113

114114
# Should respect limit
@@ -135,7 +135,7 @@ def test_parse_html_options_malformed_html(self, mock_get):
135135

136136
mock_resp = Mock()
137137
mock_resp.raise_for_status = Mock()
138-
mock_resp.text = malformed_html
138+
mock_resp.content = malformed_html.encode("utf-8")
139139
mock_get.return_value = mock_resp
140140

141141
options = parse_html_options("http://test.com")
@@ -160,7 +160,7 @@ def test_parse_html_options_special_characters(self, mock_get):
160160

161161
mock_resp = Mock()
162162
mock_resp.raise_for_status = Mock()
163-
mock_resp.text = html_with_entities
163+
mock_resp.content = html_with_entities.encode("utf-8")
164164
mock_get.return_value = mock_resp
165165

166166
options = parse_html_options("http://test.com")

tests/test_evals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ async def test_complete_firefox_installation_flow(self, mock_get, mock_post):
124124

125125
# Step 3: Search Home Manager options
126126
hm_resp = Mock()
127-
hm_resp.text = """
127+
hm_resp.content = b"""
128128
<html>
129129
<dt>programs.firefox.enable</dt>
130130
<dd>

tests/test_flakes.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ async def test_flake_search_error_handling(self, mock_post):
249249
"""Test flake search error handling."""
250250
mock_response = MagicMock()
251251
mock_response.status_code = 500
252-
mock_response.text = "Internal Server Error"
252+
mock_response.content = b"Internal Server Error"
253253

254254
# Create an HTTPError with a response attribute
255255
http_error = requests.HTTPError("500 Server Error")
@@ -314,7 +314,7 @@ async def test_home_manager_stats_with_data(self, mock_get):
314314
"""
315315

316316
mock_get.return_value.status_code = 200
317-
mock_get.return_value.text = mock_html
317+
mock_get.return_value.content = mock_html.encode("utf-8")
318318

319319
result = await home_manager_stats()
320320

@@ -329,7 +329,7 @@ async def test_home_manager_stats_with_data(self, mock_get):
329329
async def test_home_manager_stats_error_handling(self, mock_get):
330330
"""Test home_manager_stats error handling."""
331331
mock_get.return_value.status_code = 404
332-
mock_get.return_value.text = "Not Found"
332+
mock_get.return_value.content = b"Not Found"
333333

334334
result = await home_manager_stats()
335335

@@ -359,7 +359,7 @@ async def test_darwin_stats_with_data(self, mock_get):
359359
"""
360360

361361
mock_get.return_value.status_code = 200
362-
mock_get.return_value.text = mock_html
362+
mock_get.return_value.content = mock_html.encode("utf-8")
363363

364364
result = await darwin_stats()
365365

@@ -374,7 +374,7 @@ async def test_darwin_stats_with_data(self, mock_get):
374374
async def test_darwin_stats_error_handling(self, mock_get):
375375
"""Test darwin_stats error handling."""
376376
mock_get.return_value.status_code = 500
377-
mock_get.return_value.text = "Server Error"
377+
mock_get.return_value.content = b"Server Error"
378378

379379
result = await darwin_stats()
380380

@@ -402,7 +402,7 @@ async def test_stats_with_complex_categories(self, mock_get):
402402
"""
403403

404404
mock_get.return_value.status_code = 200
405-
mock_get.return_value.text = mock_html
405+
mock_get.return_value.content = mock_html.encode("utf-8")
406406

407407
result = await home_manager_stats()
408408

@@ -416,7 +416,7 @@ async def test_stats_with_complex_categories(self, mock_get):
416416
async def test_stats_with_empty_html(self, mock_get):
417417
"""Test stats functions with empty HTML."""
418418
mock_get.return_value.status_code = 200
419-
mock_get.return_value.text = "<html><body></body></html>"
419+
mock_get.return_value.content = b"<html><body></body></html>"
420420

421421
result = await home_manager_stats()
422422

@@ -546,7 +546,7 @@ async def test_combined_workflow_stats_and_search(self, mock_post, mock_get):
546546
"""
547547

548548
mock_get.return_value.status_code = 200
549-
mock_get.return_value.text = stats_html
549+
mock_get.return_value.content = stats_html.encode("utf-8")
550550

551551
stats_result = await home_manager_stats()
552552

tests/test_mcp_tools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ async def test_parse_html_options_type_extraction(self, mock_get):
148148
"""Test that type information is not properly extracted from HTML."""
149149
# Mock HTML response with proper structure
150150
mock_response = MagicMock()
151-
mock_response.text = """
151+
mock_response.content = b"""
152152
<html>
153153
<body>
154154
<dt>programs.git.enable</dt>

tests/test_plain_text_output.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ async def test_home_manager_search_plain_text(self, mock_get):
143143
"""Test home_manager_search returns plain text."""
144144
# Mock HTML response
145145
mock_response = Mock()
146-
mock_response.text = """
146+
mock_response.content = b"""
147147
<html>
148148
<dt>programs.git.enable</dt>
149149
<dd>
@@ -168,7 +168,7 @@ async def test_home_manager_info_plain_text(self, mock_get):
168168
"""Test home_manager_info returns plain text."""
169169
# Mock HTML response
170170
mock_response = Mock()
171-
mock_response.text = """
171+
mock_response.content = b"""
172172
<html>
173173
<dt>programs.git.enable</dt>
174174
<dd>
@@ -215,7 +215,7 @@ async def test_home_manager_list_options_plain_text(self, mock_get):
215215
"""Test home_manager_list_options returns plain text."""
216216
# Mock HTML response
217217
mock_response = Mock()
218-
mock_response.text = """
218+
mock_response.content = b"""
219219
<html>
220220
<dt>programs.git.enable</dt>
221221
<dd><p>Enable git</p></dd>
@@ -238,7 +238,7 @@ async def test_darwin_search_plain_text(self, mock_get):
238238
"""Test darwin_search returns plain text."""
239239
# Mock HTML response
240240
mock_response = Mock()
241-
mock_response.text = """
241+
mock_response.content = b"""
242242
<html>
243243
<dt>system.defaults.dock.autohide</dt>
244244
<dd>
@@ -263,7 +263,7 @@ async def test_no_results_plain_text(self, mock_get):
263263
"""Test empty results return appropriate plain text."""
264264
# Mock empty HTML response
265265
mock_response = Mock()
266-
mock_response.text = "<html></html>"
266+
mock_response.content = b"<html></html>"
267267
mock_response.raise_for_status = Mock()
268268
mock_get.return_value = mock_response
269269

tests/test_server.py

Lines changed: 96 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def test_es_query_missing_hits(self, mock_post):
129129
def test_parse_html_options_success(self, mock_get):
130130
"""Test successful HTML parsing."""
131131
mock_resp = Mock()
132-
mock_resp.text = """
132+
html_content = """
133133
<html>
134134
<dt>programs.git.enable</dt>
135135
<dd>
@@ -143,6 +143,7 @@ def test_parse_html_options_success(self, mock_get):
143143
</dd>
144144
</html>
145145
"""
146+
mock_resp.content = html_content.encode("utf-8")
146147
mock_resp.raise_for_status = Mock()
147148
mock_get.return_value = mock_resp
148149

@@ -156,14 +157,15 @@ def test_parse_html_options_success(self, mock_get):
156157
def test_parse_html_options_with_query(self, mock_get):
157158
"""Test HTML parsing with query filter."""
158159
mock_resp = Mock()
159-
mock_resp.text = """
160+
html_content = """
160161
<html>
161162
<dt>programs.git.enable</dt>
162163
<dd><p>Enable git</p></dd>
163164
<dt>programs.vim.enable</dt>
164165
<dd><p>Enable vim</p></dd>
165166
</html>
166167
"""
168+
mock_resp.content = html_content.encode("utf-8")
167169
mock_resp.raise_for_status = Mock()
168170
mock_get.return_value = mock_resp
169171

@@ -175,14 +177,15 @@ def test_parse_html_options_with_query(self, mock_get):
175177
def test_parse_html_options_with_prefix(self, mock_get):
176178
"""Test HTML parsing with prefix filter."""
177179
mock_resp = Mock()
178-
mock_resp.text = """
180+
html_content = """
179181
<html>
180182
<dt>programs.git.enable</dt>
181183
<dd><p>Enable git</p></dd>
182184
<dt>services.nginx.enable</dt>
183185
<dd><p>Enable nginx</p></dd>
184186
</html>
185187
"""
188+
mock_resp.content = html_content.encode("utf-8")
186189
mock_resp.raise_for_status = Mock()
187190
mock_get.return_value = mock_resp
188191

@@ -194,7 +197,7 @@ def test_parse_html_options_with_prefix(self, mock_get):
194197
def test_parse_html_options_empty_response(self, mock_get):
195198
"""Test HTML parsing with empty response."""
196199
mock_resp = Mock()
197-
mock_resp.text = "<html></html>"
200+
mock_resp.content = b"<html></html>"
198201
mock_resp.raise_for_status = Mock()
199202
mock_get.return_value = mock_resp
200203

@@ -217,13 +220,92 @@ def test_parse_html_options_limit(self, mock_get):
217220
options_html = ""
218221
for i in range(10):
219222
options_html += f"<dt>option.{i}</dt><dd><p>desc{i}</p></dd>"
220-
mock_resp.text = f"<html>{options_html}</html>"
223+
mock_resp.content = f"<html>{options_html}</html>".encode()
221224
mock_resp.raise_for_status = Mock()
222225
mock_get.return_value = mock_resp
223226

224227
result = parse_html_options("http://test.com", limit=5)
225228
assert len(result) == 5
226229

230+
@patch("mcp_nixos.server.requests.get")
231+
def test_parse_html_options_windows_1252_encoding(self, mock_get):
232+
"""Test HTML parsing with windows-1252 encoding."""
233+
# Create HTML content with special characters
234+
html_content = """
235+
<html>
236+
<head><meta charset="windows-1252"></head>
237+
<dt>programs.git.userName</dt>
238+
<dd>
239+
<p>Git user name with special chars: café</p>
240+
<span class="term">Type: string</span>
241+
</dd>
242+
</html>
243+
"""
244+
245+
mock_resp = Mock()
246+
# Simulate windows-1252 encoded content
247+
mock_resp.content = html_content.encode("windows-1252")
248+
mock_resp.encoding = "windows-1252"
249+
mock_resp.raise_for_status = Mock()
250+
mock_get.return_value = mock_resp
251+
252+
# Should not raise encoding errors
253+
result = parse_html_options("http://test.com")
254+
assert len(result) == 1
255+
assert result[0]["name"] == "programs.git.userName"
256+
assert "café" in result[0]["description"]
257+
258+
@patch("mcp_nixos.server.requests.get")
259+
def test_parse_html_options_utf8_with_bom(self, mock_get):
260+
"""Test HTML parsing with UTF-8 BOM."""
261+
html_content = """
262+
<html>
263+
<dt>programs.neovim.enable</dt>
264+
<dd>
265+
<p>Enable Neovim with unicode: 你好</p>
266+
<span class="term">Type: boolean</span>
267+
</dd>
268+
</html>
269+
"""
270+
271+
mock_resp = Mock()
272+
# Add UTF-8 BOM at the beginning
273+
mock_resp.content = b"\xef\xbb\xbf" + html_content.encode("utf-8")
274+
mock_resp.encoding = "utf-8-sig"
275+
mock_resp.raise_for_status = Mock()
276+
mock_get.return_value = mock_resp
277+
278+
result = parse_html_options("http://test.com")
279+
assert len(result) == 1
280+
assert result[0]["name"] == "programs.neovim.enable"
281+
assert "你好" in result[0]["description"]
282+
283+
@patch("mcp_nixos.server.requests.get")
284+
def test_parse_html_options_iso_8859_1_encoding(self, mock_get):
285+
"""Test HTML parsing with ISO-8859-1 encoding."""
286+
html_content = """
287+
<html>
288+
<head><meta charset="iso-8859-1"></head>
289+
<dt>services.nginx.virtualHosts</dt>
290+
<dd>
291+
<p>Nginx config with special: naïve résumé</p>
292+
</dd>
293+
</html>
294+
"""
295+
296+
mock_resp = Mock()
297+
# Simulate ISO-8859-1 encoded content
298+
mock_resp.content = html_content.encode("iso-8859-1")
299+
mock_resp.encoding = "iso-8859-1"
300+
mock_resp.raise_for_status = Mock()
301+
mock_get.return_value = mock_resp
302+
303+
result = parse_html_options("http://test.com")
304+
assert len(result) == 1
305+
assert result[0]["name"] == "services.nginx.virtualHosts"
306+
assert "naïve" in result[0]["description"]
307+
assert "résumé" in result[0]["description"]
308+
227309

228310
class TestNixOSTools:
229311
"""Test all NixOS tools."""
@@ -516,8 +598,10 @@ async def test_home_manager_stats(self, mock_get):
516598
</body>
517599
</html>
518600
"""
519-
mock_get.return_value.status_code = 200
520-
mock_get.return_value.text = mock_html
601+
mock_resp = Mock()
602+
mock_resp.content = mock_html.encode("utf-8")
603+
mock_resp.raise_for_status = Mock()
604+
mock_get.return_value = mock_resp
521605

522606
result = await home_manager_stats()
523607
assert "Home Manager Statistics:" in result
@@ -604,8 +688,10 @@ async def test_darwin_stats(self, mock_get):
604688
</body>
605689
</html>
606690
"""
607-
mock_get.return_value.status_code = 200
608-
mock_get.return_value.text = mock_html
691+
mock_resp = Mock()
692+
mock_resp.content = mock_html.encode("utf-8")
693+
mock_resp.raise_for_status = Mock()
694+
mock_get.return_value = mock_resp
609695

610696
result = await darwin_stats()
611697
assert "nix-darwin Statistics:" in result
@@ -664,7 +750,7 @@ async def test_special_characters_in_query(self, mock_query):
664750
def test_malformed_html_response(self, mock_get):
665751
"""Test parsing malformed HTML."""
666752
mock_resp = Mock()
667-
mock_resp.text = "<html><dt>broken" # Malformed HTML
753+
mock_resp.content = b"<html><dt>broken" # Malformed HTML
668754
mock_resp.raise_for_status = Mock()
669755
mock_get.return_value = mock_resp
670756

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)