From 2e932f4ce191ce35d557b1155cced3f7c69f2301 Mon Sep 17 00:00:00 2001 From: Lars Kiesow Date: Sat, 18 Apr 2026 12:46:33 +0200 Subject: [PATCH] Add base_url option to get absolute URLs This patch adds a new `base_url` option that lets users set a base URL against which relative URLs are resolved into absolute URLs. This option is useful if you want to convert an arbitrary web page and then use the result outside of its original context. As a simple example, if you just store the Markdown as a local file for later reference, the page context is lost and you no longer know how to resolve any relative URLs. --- README.rst | 5 +++++ markdownify/__init__.py | 20 ++++++++++++++++++++ markdownify/__init__.pyi | 2 ++ tests/test_conversions.py | 7 +++++++ 4 files changed, 34 insertions(+) diff --git a/README.rst b/README.rst index 059a68f..e63e9a5 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,11 @@ autolinks A boolean indicating whether the "automatic link" style should be used when a ``a`` tag's contents match its href. Defaults to ``True``. +base_url + A base URL to use for resolving relative URLs. When specified, relative URLs + in the HTML will be converted to absolute URLs using this base. Defaults to + no base URL. + default_title A boolean to enable setting the title of a link to its href, if no title is given. Defaults to ``False``. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 148d340..dbffc82 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -2,6 +2,7 @@ from textwrap import fill import re import six +import urllib.parse # General-purpose regex patterns @@ -176,6 +177,7 @@ def _next_block_content_sibling(el): class MarkdownConverter(object): class DefaultOptions: autolinks = True + base_url = '' bs4_options = 'html.parser' bullets = '*+-' # An iterable of bullet types.
code_language = '' @@ -435,6 +437,20 @@ def underline(self, text, pad_char): text = (text or '').rstrip() return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' + def _make_absolute(self, url): + """Convert a URL to absolute using base_url if it's not already absolute.""" + base_url = self.options['base_url'] + + # Check if URLs to join actually exist + if not url or not base_url: + return url + + # Check if URL is already absolute + if urllib.parse.urlparse(url).netloc: + return url + + return urllib.parse.urljoin(base_url, url) + def convert_a(self, el, text, parent_tags): if '_noformat' in parent_tags: return text @@ -453,6 +469,7 @@ def convert_a(self, el, text, parent_tags): if self.options['default_title'] and not title: title = href title_part = ' "%s"' % title.replace('"', r'\"') if title else '' + href = self._make_absolute(href) return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol']) @@ -588,6 +605,7 @@ def convert_img(self, el, text, parent_tags): and el.parent.name not in self.options['keep_inline_images_in']): return alt + src = self._make_absolute(src) return '![%s](%s%s)' % (alt, src, title_part) def convert_video(self, el, text, parent_tags): @@ -600,6 +618,8 @@ def convert_video(self, el, text, parent_tags): if sources: src = sources[0].attrs.get('src', None) or '' poster = el.attrs.get('poster', None) or '' + src = self._make_absolute(src) + poster = self._make_absolute(poster) if src and poster: return '[![%s](%s)](%s)' % (text, poster, src) if src: diff --git a/markdownify/__init__.pyi b/markdownify/__init__.pyi index 5f9b852..94a9060 100644 --- a/markdownify/__init__.pyi +++ b/markdownify/__init__.pyi @@ -18,6 +18,7 @@ STRIP_ONE: str def markdownify( html: str, autolinks: bool = ..., + base_url: str = ..., bs4_options: str = ..., bullets: str = ..., code_language: str = ..., @@ -46,6 +47,7 @@ class 
MarkdownConverter: def __init__( self, autolinks: bool = ..., + base_url: str = ..., bs4_options: str = ..., bullets: str = ..., code_language: str = ..., diff --git a/tests/test_conversions.py b/tests/test_conversions.py index dd99dfb..b8d04d1 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -374,3 +374,10 @@ def test_spaces(): assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' assert md('