Skip to content

Commit 328df2b

Browse files
committed
Refactor MarkdownScraper to improve markdown parsing and XML conversion; update transformers version in uv.lock
1 parent 48b66ef commit 328df2b

File tree

3 files changed

+191
-19
lines changed

3 files changed

+191
-19
lines changed

RAGnificent/core/scraper.py

Lines changed: 186 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -468,28 +468,207 @@ def _convert_content(
468468

469469
# Then convert to the requested format
470470
try:
471-
# Try to use functions from ragnificent_rs for conversion
472-
from ragnificent_rs import document_to_xml, parse_markdown_to_document
473-
474-
document = parse_markdown_to_document(markdown_content, url)
471+
# Parse markdown to structured document
472+
document = self._parse_markdown_to_document(markdown_content, url)
475473

476474
if output_format == "json":
477475
# Using json that was already imported at the top level
478476
content = json.dumps(document, indent=2)
479477
elif output_format == "xml":
480-
content = document_to_xml(document)
478+
content = self._document_to_xml(document)
481479
else:
482480
# Fallback to markdown if format not supported
483481
content = markdown_content
484-
except ImportError:
482+
except Exception as e:
485483
# Fallback to markdown if conversion functions are not available
486484
logger.warning(
487-
f"Could not convert to {output_format}, using markdown instead"
485+
f"Could not convert to {output_format}, using markdown instead. Error: {e}"
488486
)
489487
content = markdown_content
490488

491489
return content, markdown_content
492490

491+
def _parse_markdown_to_document(self, markdown: str, base_url: str) -> Dict:
    """Parse markdown into a structured document for JSON/XML conversion.

    This is a deliberately simple, line-based parser (not a full CommonMark
    implementation). Each line is classified as exactly one of: fenced code,
    heading, list item, blockquote or paragraph.

    Args:
        markdown: Markdown text to parse.
        base_url: Stored unchanged under the ``"base_url"`` key.

    Returns:
        A dict with the keys ``title``, ``base_url``, ``headings``,
        ``paragraphs``, ``links``, ``images``, ``lists``, ``code_blocks``
        and ``blockquotes``. ``title`` is the text of the first level-1
        heading found outside fenced code (``"No Title"`` if there is none);
        that heading is not repeated in ``headings``. Links and images are
        collected from paragraph lines only.
    """
    import re

    # (?<!!) keeps image syntax ``![alt](src)`` out of the link list.
    link_pattern = re.compile(r"(?<!!)\[([^\]]+)\]\(([^\)]+)\)")
    image_pattern = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")
    # One to six '#' followed by a space; anything else is not a heading.
    heading_pattern = re.compile(r"(#{1,6}) (.*)")
    ordered_marker = re.compile(r"\d{1,9}\. ")

    document = {
        "title": "No Title",
        "base_url": base_url,
        "headings": [],
        "paragraphs": [],
        "links": [],
        "images": [],
        "lists": [],
        "code_blocks": [],
        "blockquotes": [],
    }

    title_found = False
    in_code_block = False
    code_lang = ""
    code_lines = []
    current_list = []

    def flush_list():
        """Store the list being collected, if any, and start a new one."""
        if current_list:
            document["lists"].append(list(current_list))
            current_list.clear()

    for line in markdown.split("\n"):
        # Inside a fence every line is code until the closing fence, so
        # '#' comments or '-' lines in code are never misread as markdown.
        if in_code_block:
            if line.startswith("```"):
                document["code_blocks"].append(
                    {"language": code_lang, "code": "\n".join(code_lines)}
                )
                in_code_block = False
                code_lines = []
            else:
                code_lines.append(line)
            continue

        if line.startswith("```"):
            flush_list()
            in_code_block = True
            code_lang = line[3:].strip()
            code_lines = []
            continue

        stripped = line.strip()

        heading = heading_pattern.match(line) if line.startswith("#") else None
        if heading:
            flush_list()
            level = len(heading.group(1))
            text = heading.group(2).strip()
            if level == 1 and text and not title_found:
                # The first H1 is the title and is not listed as a heading.
                document["title"] = text
                title_found = True
            else:
                document["headings"].append({"level": level, "text": text})
            continue

        if stripped.startswith(("- ", "* ")):
            current_list.append(stripped[2:])
            continue
        ordered = ordered_marker.match(stripped)
        if ordered:
            current_list.append(stripped[ordered.end():])
            continue

        # Any other line ends the current list, then is processed normally
        # instead of being discarded.
        flush_list()
        if not stripped:
            continue

        if line.startswith(">"):
            document["blockquotes"].append(line[1:].strip())
            continue

        for match in link_pattern.finditer(line):
            document["links"].append(
                {"text": match.group(1), "url": match.group(2)}
            )
        for match in image_pattern.finditer(line):
            document["images"].append(
                {"alt": match.group(1), "url": match.group(2)}
            )
        document["paragraphs"].append(stripped)

    flush_list()
    if in_code_block:
        # Unterminated fence: keep what was collected rather than lose it.
        document["code_blocks"].append(
            {"language": code_lang, "code": "\n".join(code_lines)}
        )

    return document
595+
596+
def _document_to_xml(self, document: Dict) -> str:
    """Convert a parsed document structure to a pretty-printed XML string.

    Args:
        document: Mapping with ``title`` and ``base_url`` plus the optional
            sections ``headings``, ``paragraphs``, ``links``, ``images``,
            ``lists``, ``code_blocks`` and ``blockquotes``. Missing or empty
            sections are omitted from the output.

    Returns:
        The XML document as text, rooted at ``<document>``.
    """
    import re
    import xml.etree.ElementTree as ET
    from xml.dom import minidom

    # ElementTree writes characters that XML 1.0 forbids (most control
    # characters, lone surrogates) verbatim, and minidom then refuses to
    # parse the result. Strip them up front so conversion cannot fail on
    # stray control characters in the input text.
    illegal_xml_chars = re.compile(
        r"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"
    )

    def clean(value) -> str:
        """Return *value* as text that is safe to place in XML."""
        if value is None:
            return ""
        return illegal_xml_chars.sub("", str(value))

    root = ET.Element("document")
    ET.SubElement(root, "title").text = clean(document.get("title"))
    ET.SubElement(root, "base_url").text = clean(document.get("base_url"))

    headings = document.get("headings")
    if headings:
        headings_elem = ET.SubElement(root, "headings")
        for entry in headings:
            heading = ET.SubElement(headings_elem, "heading")
            heading.set("level", str(entry["level"]))
            heading.text = clean(entry["text"])

    paragraphs = document.get("paragraphs")
    if paragraphs:
        paragraphs_elem = ET.SubElement(root, "paragraphs")
        for text in paragraphs:
            ET.SubElement(paragraphs_elem, "paragraph").text = clean(text)

    links = document.get("links")
    if links:
        links_elem = ET.SubElement(root, "links")
        for entry in links:
            link = ET.SubElement(links_elem, "link")
            link.set("href", clean(entry["url"]))
            link.text = clean(entry["text"])

    images = document.get("images")
    if images:
        images_elem = ET.SubElement(root, "images")
        for entry in images:
            image = ET.SubElement(images_elem, "image")
            image.set("src", clean(entry["url"]))
            image.set("alt", clean(entry["alt"]))

    lists = document.get("lists")
    if lists:
        lists_elem = ET.SubElement(root, "lists")
        for items in lists:
            list_elem = ET.SubElement(lists_elem, "list")
            for item in items:
                ET.SubElement(list_elem, "item").text = clean(item)

    code_blocks = document.get("code_blocks")
    if code_blocks:
        code_blocks_elem = ET.SubElement(root, "code_blocks")
        for entry in code_blocks:
            code_block = ET.SubElement(code_blocks_elem, "code_block")
            if entry["language"]:
                code_block.set("language", clean(entry["language"]))
            code_block.text = clean(entry["code"])

    blockquotes = document.get("blockquotes")
    if blockquotes:
        blockquotes_elem = ET.SubElement(root, "blockquotes")
        for text in blockquotes:
            ET.SubElement(blockquotes_elem, "blockquote").text = clean(text)

    # Round-trip through minidom purely for pretty formatting.
    rough_string = ET.tostring(root, encoding="utf-8")
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent=" ")
671+
493672
def scrape_by_sitemap(
494673
self,
495674
base_url: str,

output.md

Lines changed: 0 additions & 7 deletions
This file was deleted.

uv.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)