Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ jobs:
- run: |
sudo apt-get -qq update
sudo apt-get -qq install build-essential libffi-dev python3-dev \
libxml2-dev libxslt-dev xmlsec1 zlib1g-dev libjpeg-dev libwebp-dev
libxslt-dev xmlsec1 zlib1g-dev libjpeg-dev libwebp-dev

- uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
Expand Down Expand Up @@ -496,7 +496,7 @@ jobs:
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Install libs necessary for PyPy to build binary wheels for dependencies
- run: sudo apt-get -qq install xmlsec1 libxml2-dev libxslt-dev
- run: sudo apt-get -qq install xmlsec1 libxslt-dev
- uses: matrix-org/setup-python-poetry@5bbf6603c5c930615ec8a29f1b5d7d258d905aa4 # v2.0.0
with:
python-version: ${{ matrix.python-version }}
Expand Down
1 change: 1 addition & 0 deletions changelog.d/19301.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Switch to `beautifulsoup4` from `lxml` for URL previews. Contributed by @clokep.
6 changes: 1 addition & 5 deletions docs/setup/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ Installing prerequisites on CentOS or Fedora Linux:

```sh
sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \
libwebp-devel libxml2-devel libxslt-devel libpq-devel \
libwebp-devel libxslt-devel libpq-devel \
python3-virtualenv libffi-devel openssl-devel python3-devel
sudo dnf group install "Development Tools"
```
Expand Down Expand Up @@ -633,10 +633,6 @@ This is critical from a security perspective to stop arbitrary Matrix users
spidering 'internal' URLs on your network. At the very least we recommend that
your loopback and RFC1918 IP addresses are blacklisted.

This also requires the optional `lxml` python dependency to be installed. This
in turn requires the `libxml2` library to be available - on Debian/Ubuntu this
means `apt-get install libxml2-dev`, or equivalent for your OS.
Comment thread
clokep marked this conversation as resolved.
Comment thread
clokep marked this conversation as resolved.
Comment thread
clokep marked this conversation as resolved.

### Backups

Don't forget to take [backups](../usage/administration/backups.md) of your new server!
Expand Down
1 change: 0 additions & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@
libjpeg
libpqxx
libwebp
libxml2
libxslt
sqlite

Expand Down
314 changes: 82 additions & 232 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ oidc = ["authlib>=0.15.1"]
# `systemd.journal.JournalHandler`, as is documented in
# `contrib/systemd/log_config.yaml`.
systemd = ["systemd-python>=231"]
url-preview = ["lxml>=4.6.3"]
url-preview = ["beautifulsoup4>=4.13.0"]
sentry = ["sentry-sdk>=0.7.2"]
opentracing = [
"jaeger-client>=4.2.0",
Expand Down Expand Up @@ -182,7 +182,7 @@ all = [
# oidc and jwt
"authlib>=0.15.1",
# url-preview
"lxml>=4.6.3",
"beautifulsoup4>=4.13.0",
# sentry
"sentry-sdk>=0.7.2",
# opentracing
Expand Down Expand Up @@ -266,7 +266,6 @@ generate-setup-file = true
ruff = "0.14.6"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
Expand Down
103 changes: 44 additions & 59 deletions synapse/media/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,21 @@
import html
import logging
import urllib.parse
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING

import attr

from synapse.media.preview_html import parse_html_description
from synapse.media.preview_html import (
NON_BLANK,
decode_body,
get_attribute,
parse_html_description,
)
from synapse.types import JsonDict
from synapse.util.json import json_decoder

if TYPE_CHECKING:
from lxml import etree
from bs4 import BeautifulSoup

from synapse.server import HomeServer

Expand Down Expand Up @@ -105,35 +110,25 @@ def get_oembed_url(self, url: str) -> str | None:
# No match.
return None

def autodiscover_from_html(self, tree: "etree._Element") -> str | None:
def autodiscover_from_html(self, soup: "BeautifulSoup") -> str | None:
"""
Search an HTML document for oEmbed autodiscovery information.

Args:
tree: The parsed HTML body.
soup: The parsed HTML body.

Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
list["etree._Element"],
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return cast(str, tag.attrib["href"])

# Some providers (e.g. Flickr) use alternative instead of alternate.
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(
list["etree._Element"],
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return cast(str, tag.attrib["href"])

return None
# Some providers (e.g. Flickr) use `alternative` instead of `alternate`.
tag = soup.find(
"link",
rel=("alternate", "alternative"),
Comment thread
MadLittleMods marked this conversation as resolved.
type="application/json+oembed",
href=NON_BLANK,
)
return get_attribute(tag, "href") if tag else None

def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Expand Down Expand Up @@ -196,7 +191,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
if oembed_type == "rich":
html_str = oembed.get("html")
if isinstance(html_str, str):
calc_description_and_urls(open_graph_response, html_str)
calc_description_and_urls(open_graph_response, html_str, url)

elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail.
Expand All @@ -208,7 +203,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
open_graph_response["og:type"] = "video.other"
html_str = oembed.get("html")
if html_str and isinstance(html_str, str):
calc_description_and_urls(open_graph_response, oembed["html"])
calc_description_and_urls(open_graph_response, oembed["html"], url)
for size in ("width", "height"):
val = oembed.get(size)
if type(val) is int: # noqa: E721
Expand All @@ -223,55 +218,45 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_urls(tree: "etree._Element", tag_name: str) -> list[str]:
results = []
# Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
for tag in cast(list["etree._Element"], tree.xpath("//*/" + tag_name)):
if "src" in tag.attrib:
results.append(cast(str, tag.attrib["src"]))
return results
def _fetch_url(soup: "BeautifulSoup", tag_name: str) -> str | None:
tag = soup.find(tag_name, src=NON_BLANK)
return get_attribute(tag, "src") if tag else None


def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
def calc_description_and_urls(
open_graph_response: JsonDict, html_body: str, url: str
) -> None:
"""
Calculate the description and image/video URLs for an HTML document.

This uses lxml to convert the HTML document into plaintext. If errors
This uses BeautifulSoup to convert the HTML document into plaintext. If errors
occur during processing of the document, an empty response is returned.

Args:
open_graph_response: The current Open Graph summary. This is updated with additional fields.
html_body: The HTML document, as a string.

Returns:
The summary
url: The URL which is being previewed (not the one which was requested).
"""
# If there's no body, nothing useful is going to be found.
if not html_body:
return

from lxml import etree
soup = decode_body(html_body, url)

# Create an HTML parser. If this fails, log and return no metadata.
parser = etree.HTMLParser(recover=True, encoding="utf-8")

# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)

# The data was successfully parsed, but no tree was found.
if tree is None:
# If there's no body, nothing useful is going to be found.
if not soup:
return

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]

video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]

description = parse_html_description(tree)
image_url = _fetch_url(soup, "img")
if image_url:
open_graph_response["og:image"] = image_url

video_url = _fetch_url(soup, "video")
if video_url:
open_graph_response["og:video"] = video_url
else:
embed_url = _fetch_url(soup, "embed")
if embed_url:
open_graph_response["og:video"] = embed_url

description = parse_html_description(soup)
if description:
open_graph_response["og:description"] = description
Loading
Loading