element-hq · clokep · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025 · Dec 11, 2025
@@ -444,7 +444,7 @@ jobs:
       - run: |
           sudo apt-get -qq update
           sudo apt-get -qq install build-essential libffi-dev python3-dev \
-            libxml2-dev libxslt-dev xmlsec1 zlib1g-dev libjpeg-dev libwebp-dev
+            libxslt-dev xmlsec1 zlib1g-dev libjpeg-dev libwebp-dev
 
       - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
@@ -496,7 +496,7 @@ jobs:
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       # Install libs necessary for PyPy to build binary wheels for dependencies
-      - run: sudo apt-get -qq install xmlsec1 libxml2-dev libxslt-dev
+      - run: sudo apt-get -qq install xmlsec1 libxslt-dev
       - uses: matrix-org/setup-python-poetry@5bbf6603c5c930615ec8a29f1b5d7d258d905aa4 # v2.0.0
         with:
           python-version: ${{ matrix.python-version }}

@@ -0,0 +1 @@
+Switch to `beautifulsoup4` from `lxml` for URL previews. Contributed by @clokep.
@@ -307,7 +307,7 @@ Installing prerequisites on CentOS or Fedora Linux:
 
 ```sh
 sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \
-                 libwebp-devel libxml2-devel libxslt-devel libpq-devel \
+                 libwebp-devel libxslt-devel libpq-devel \
                  python3-virtualenv libffi-devel openssl-devel python3-devel
 sudo dnf group install "Development Tools"
 ```
@@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users
 spidering 'internal' URLs on your network. At the very least we recommend that
 your loopback and RFC1918 IP addresses are blacklisted.
 
-This also requires the optional `lxml` python dependency to be  installed. This
-in turn requires the `libxml2` library to be available - on  Debian/Ubuntu this
-means `apt-get install libxml2-dev`, or equivalent for your OS.
-
 ### Backups
 
 Don't forget to take [backups](../usage/administration/backups.md) of your new server!

@@ -100,7 +100,6 @@
                   libjpeg
                   libpqxx
                   libwebp
-                  libxml2
                   libxslt
                   sqlite
 

@@ -139,7 +139,7 @@ oidc = ["authlib>=0.15.1"]
 # `systemd.journal.JournalHandler`, as is documented in
 # `contrib/systemd/log_config.yaml`.
 systemd = ["systemd-python>=231"]
-url-preview = ["lxml>=4.6.3"]
+url-preview = ["beautifulsoup4>=4.13.0"]
 sentry = ["sentry-sdk>=0.7.2"]
 opentracing = [
     "jaeger-client>=4.2.0",
@@ -182,7 +182,7 @@ all = [
     # oidc and jwt
     "authlib>=0.15.1",
     # url-preview
-    "lxml>=4.6.3",
+    "beautifulsoup4>=4.13.0",
     # sentry
     "sentry-sdk>=0.7.2",
     # opentracing
@@ -266,7 +266,6 @@ generate-setup-file = true
 ruff = "0.14.6"
 
 # Typechecking
-lxml-stubs = ">=0.4.0"
 mypy = "*"
 mypy-zope = "*"
 types-bleach = ">=4.1.0"

@@ -21,16 +21,21 @@
 import html
 import logging
 import urllib.parse
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
 
 import attr
 
-from synapse.media.preview_html import parse_html_description
+from synapse.media.preview_html import (
+    NON_BLANK,
+    decode_body,
+    get_attribute,
+    parse_html_description,
+)
 from synapse.types import JsonDict
 from synapse.util.json import json_decoder
 
 if TYPE_CHECKING:
-    from lxml import etree
+    from bs4 import BeautifulSoup
 
     from synapse.server import HomeServer
 
@@ -105,35 +110,25 @@ def get_oembed_url(self, url: str) -> str | None:
         # No match.
         return None
 
-    def autodiscover_from_html(self, tree: "etree._Element") -> str | None:
+    def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None:
         """
         Search an HTML document for oEmbed autodiscovery information.
 
         Args:
-            tree: The parsed HTML body.
+            soup: The parsed HTML body.
 
         Returns:
             The URL to use for oEmbed information, or None if no URL was found.
         """
         # Search for link elements with the proper rel and type attributes.
-        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
-        for tag in cast(
-            list["etree._Element"],
-            tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
-        ):
-            if "href" in tag.attrib:
-                return cast(str, tag.attrib["href"])
-
-        # Some providers (e.g. Flickr) use alternative instead of alternate.
-        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
-        for tag in cast(
-            list["etree._Element"],
-            tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
-        ):
-            if "href" in tag.attrib:
-                return cast(str, tag.attrib["href"])
-
-        return None
+        # Some providers (e.g. Flickr) use `alternative` instead of `alternate`.
+        tag = soup.find(
+            "link",
+            rel=("alternate", "alternative"),
+            type="application/json+oembed",
+            href=NON_BLANK,
+        )
+        return get_attribute(tag, "href") if tag else None
 
     def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
@@ -196,7 +191,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         if oembed_type == "rich":
             html_str = oembed.get("html")
             if isinstance(html_str, str):
-                calc_description_and_urls(open_graph_response, html_str)
+                calc_description_and_urls(open_graph_response, html_str, url)
 
         elif oembed_type == "photo":
             # If this is a photo, use the full image, not the thumbnail.
@@ -208,7 +203,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
             open_graph_response["og:type"] = "video.other"
             html_str = oembed.get("html")
             if html_str and isinstance(html_str, str):
-                calc_description_and_urls(open_graph_response, oembed["html"])
+                calc_description_and_urls(open_graph_response, oembed["html"], url)
             for size in ("width", "height"):
                 val = oembed.get(size)
                 if type(val) is int:  # noqa: E721
@@ -223,55 +218,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         return OEmbedResult(open_graph_response, author_name, cache_age)
 
 
-def _fetch_urls(tree: "etree._Element", tag_name: str) -> list[str]:
-    results = []
-    # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
-    for tag in cast(list["etree._Element"], tree.xpath("//*/" + tag_name)):
-        if "src" in tag.attrib:
-            results.append(cast(str, tag.attrib["src"]))
-    return results
+def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None:
+    tag = soup.find(tag_name, src=NON_BLANK)
+    return get_attribute(tag, "src") if tag else None
 
 
-def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
+def calc_description_and_urls(
+    open_graph_response: JsonDict, html_body: str, url: str
+) -> None:
     """
     Calculate description for an HTML document.
 
-    This uses lxml to convert the HTML document into plaintext. If errors
+    This uses BeautifulSoup to convert the HTML document into plaintext. If no
     document can be parsed from the body, `open_graph_response` is left unmodified.
 
     Args:
         open_graph_response: The current Open Graph summary. This is updated with additional fields.
         html_body: The HTML document, as a string.
-
-    Returns:
-        The summary
+        url: The URL which is being previewed (not the one which was requested).
     """
-    # If there's no body, nothing useful is going to be found.
-    if not html_body:
-        return
-
-    from lxml import etree
+    soup = decode_body(html_body, url)
 
-    # Create an HTML parser. If this fails, log and return no metadata.
-    parser = etree.HTMLParser(recover=True, encoding="utf-8")
-
-    # Attempt to parse the body. If this fails, log and return no metadata.
-    tree = etree.fromstring(html_body, parser)
-
-    # The data was successfully parsed, but no tree was found.
-    if tree is None:
+    # If there's no body, nothing useful is going to be found.
+    if not soup:
         return
 
     # Attempt to find interesting URLs (images, videos, embeds).
     if "og:image" not in open_graph_response:
-        image_urls = _fetch_urls(tree, "img")
-        if image_urls:
-            open_graph_response["og:image"] = image_urls[0]
-
-    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
-    if video_urls:
-        open_graph_response["og:video"] = video_urls[0]
-
-    description = parse_html_description(tree)
+        image_url = _fetch_url(soup, "img")
+        if image_url:
+            open_graph_response["og:image"] = image_url
+
+    video_url = _fetch_url(soup, "video")
+    if video_url:
+        open_graph_response["og:video"] = video_url
+    else:
+        embed_url = _fetch_url(soup, "embed")
+        if embed_url:
+            open_graph_response["og:video"] = embed_url
+
+    description = parse_html_description(soup)
     if description:
         open_graph_response["og:description"] = description
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Switch to `beautifulsoup4` from `lxml` for URL previews. Contributed by @clokep.
-Original file line number
+Diff line change
@@ Expand Up / @@ -100,7 +100,6 @@ @@
                       libjpeg
                       libpqxx
                       libwebp
-                      libxml2
                       libxslt
                       sqlite
@@ Expand Down @@