@@ -468,28 +468,207 @@ def _convert_content(
468468
469469 # Then convert to the requested format
470470 try :
471- # Try to use functions from ragnificent_rs for conversion
472- from ragnificent_rs import document_to_xml , parse_markdown_to_document
473-
474- document = parse_markdown_to_document (markdown_content , url )
471+ # Parse markdown to structured document
472+ document = self ._parse_markdown_to_document (markdown_content , url )
475473
476474 if output_format == "json" :
477475 # Using json that was already imported at the top level
478476 content = json .dumps (document , indent = 2 )
479477 elif output_format == "xml" :
480- content = document_to_xml (document )
478+ content = self . _document_to_xml (document )
481479 else :
482480 # Fallback to markdown if format not supported
483481 content = markdown_content
484- except ImportError :
482+ except Exception as e :
485483 # Fallback to markdown if conversion functions are not available
486484 logger .warning (
487- f"Could not convert to { output_format } , using markdown instead"
485+ f"Could not convert to { output_format } , using markdown instead. Error: { e } "
488486 )
489487 content = markdown_content
490488
491489 return content , markdown_content
492490
def _parse_markdown_to_document(self, markdown: str, base_url: str) -> Dict:
    """Parse markdown into a flat, structured document for JSON/XML export.

    The parser is deliberately line-oriented and simplified: every line is
    classified as exactly one of code, list item, heading, blockquote or
    paragraph. No line outside a code fence is silently discarded.

    Args:
        markdown: Markdown text to parse. ``None`` or an empty string yields
            an empty document.
        base_url: URL the markdown was produced from; stored verbatim under
            ``"base_url"``.

    Returns:
        A dict with the keys ``title`` (text of the first level-1 heading
        outside a code fence, or ``"No Title"``), ``base_url``, ``headings``
        (``{"level", "text"}`` dicts, excluding the title), ``paragraphs``,
        ``links`` (``{"text", "url"}``), ``images`` (``{"alt", "url"}``),
        ``lists`` (one list of item strings per markdown list),
        ``code_blocks`` (``{"language", "code"}``) and ``blockquotes``.
    """
    # Function-scope import keeps this method self-contained regardless of
    # what the module imports at top level.
    import re

    # The negative lookbehind keeps the "[alt](src)" tail of an image from
    # also being reported as a link.
    link_re = re.compile(r"(?<!!)\[([^\]]+)\]\(([^\)]+)\)")
    image_re = re.compile(r"!\[([^\]]*)\]\(([^\)]+)\)")
    # One to six hashes followed by a space; anything else is not a heading.
    heading_re = re.compile(r"(#{1,6}) (.*)")
    # "- ", "* " or "<digits>. " at the start of the stripped line.
    list_marker_re = re.compile(r"(?:[-*]|\d+\.) ")

    document = {
        "title": "No Title",
        "base_url": base_url,
        "headings": [],
        "paragraphs": [],
        "links": [],
        "images": [],
        "lists": [],
        "code_blocks": [],
        "blockquotes": [],
    }

    current_list = []
    code_lines = []
    code_lang = ""
    in_code_block = False
    title_found = False

    def collect_references(text):
        """Record every inline link and image found in *text*."""
        for found in link_re.finditer(text):
            document["links"].append(
                {"text": found.group(1), "url": found.group(2)}
            )
        for found in image_re.finditer(text):
            document["images"].append(
                {"alt": found.group(1), "url": found.group(2)}
            )

    def close_list():
        """Store the list being collected, if any, and start a fresh one."""
        if current_list:
            document["lists"].append(current_list[:])
            current_list.clear()

    def close_code_block():
        """Store the code block being collected."""
        document["code_blocks"].append(
            {"language": code_lang, "code": "\n".join(code_lines)}
        )

    for raw_line in (markdown or "").split("\n"):
        line = raw_line.rstrip("\r")  # tolerate CRLF input
        stripped = line.strip()

        # Inside a fence everything is code until the closing fence, so
        # "#", "-" or ">" at the start of a code line is never markup.
        if in_code_block:
            if line.startswith("```"):
                in_code_block = False
                close_code_block()
                code_lines = []
            else:
                code_lines.append(line)
            continue

        if line.startswith("```"):
            close_list()
            in_code_block = True
            code_lang = line[3:].strip()
            code_lines = []
            continue

        marker = list_marker_re.match(stripped)
        if marker:
            item = stripped[marker.end():]
            current_list.append(item)
            collect_references(item)
            continue

        # Any non-item line ends the current list, but the line itself is
        # still classified below instead of being thrown away.
        close_list()

        if not stripped:
            continue

        heading = heading_re.match(line)
        if heading:
            level = len(heading.group(1))
            text = heading.group(2).strip()
            collect_references(text)
            if level == 1 and not title_found:
                # Only the first h1 is the title; later identical lines
                # are ordinary headings.
                document["title"] = text
                title_found = True
            else:
                document["headings"].append({"level": level, "text": text})
            continue

        if line.startswith(">"):
            quote = line[1:].strip()
            document["blockquotes"].append(quote)
            collect_references(quote)
            continue

        # Everything else, including "#tag"-style lines that are not valid
        # headings, is kept as a paragraph.
        collect_references(line)
        document["paragraphs"].append(stripped)

    # Flush whatever was still open when the input ended.
    close_list()
    if in_code_block:
        close_code_block()

    return document
595+
def _document_to_xml(self, document: Dict) -> str:
    """Serialize a parsed document dict to a pretty-printed XML string.

    The root element is ``<document>``. ``<title>`` and ``<base_url>`` are
    always emitted; each of ``headings``, ``paragraphs``, ``links``,
    ``images``, ``lists``, ``code_blocks`` and ``blockquotes`` gets a
    container element only when the corresponding entry is non-empty.

    Args:
        document: Mapping with the keys listed above. Missing or empty
            sections are skipped rather than raising ``KeyError``.

    Returns:
        The XML document as text, including the XML declaration, formatted
        by ``xml.dom.minidom``.
    """
    # Function-scope imports keep this method self-contained regardless of
    # what the module imports at top level.
    import re
    import xml.etree.ElementTree as ET
    from xml.dom import minidom

    # Characters XML 1.0 forbids (most C0 controls, lone surrogates, the
    # two non-characters). ElementTree writes them out unchanged and the
    # minidom re-parse below then fails, so they are removed up front.
    illegal_chars = re.compile(
        r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]"
    )

    def clean(value) -> str:
        """Return *value* as text that is safe to embed in XML."""
        if value is None:
            return ""
        return illegal_chars.sub("", str(value))

    def entries(key):
        """Return the section stored under *key*, or an empty list."""
        return document.get(key) or []

    root = ET.Element("document")
    ET.SubElement(root, "title").text = clean(document.get("title"))
    ET.SubElement(root, "base_url").text = clean(document.get("base_url"))

    heading_entries = entries("headings")
    if heading_entries:
        container = ET.SubElement(root, "headings")
        for entry in heading_entries:
            node = ET.SubElement(container, "heading")
            node.set("level", str(entry["level"]))
            node.text = clean(entry["text"])

    paragraph_entries = entries("paragraphs")
    if paragraph_entries:
        container = ET.SubElement(root, "paragraphs")
        for text in paragraph_entries:
            ET.SubElement(container, "paragraph").text = clean(text)

    link_entries = entries("links")
    if link_entries:
        container = ET.SubElement(root, "links")
        for entry in link_entries:
            node = ET.SubElement(container, "link")
            node.set("href", clean(entry["url"]))
            node.text = clean(entry["text"])

    image_entries = entries("images")
    if image_entries:
        container = ET.SubElement(root, "images")
        for entry in image_entries:
            node = ET.SubElement(container, "image")
            node.set("src", clean(entry["url"]))
            node.set("alt", clean(entry["alt"]))

    list_entries = entries("lists")
    if list_entries:
        container = ET.SubElement(root, "lists")
        for items in list_entries:
            list_node = ET.SubElement(container, "list")
            for item in items:
                ET.SubElement(list_node, "item").text = clean(item)

    code_entries = entries("code_blocks")
    if code_entries:
        container = ET.SubElement(root, "code_blocks")
        for entry in code_entries:
            node = ET.SubElement(container, "code_block")
            # The language attribute is omitted for fences without one.
            if entry["language"]:
                node.set("language", clean(entry["language"]))
            node.text = clean(entry["code"])

    quote_entries = entries("blockquotes")
    if quote_entries:
        container = ET.SubElement(root, "blockquotes")
        for text in quote_entries:
            ET.SubElement(container, "blockquote").text = clean(text)

    # ElementTree builds the tree; minidom is used only for pretty-printing.
    rough_bytes = ET.tostring(root, encoding='utf-8')
    return minidom.parseString(rough_bytes).toprettyxml(indent=" ")
671+
493672 def scrape_by_sitemap (
494673 self ,
495674 base_url : str ,
0 commit comments