Skip to content

Commit 6bd530b

Browse files
committed
feat: add Typesense backend for search
Also run `make upgrade` to be able to successfully compile dependencies. This upgrade may need to be refactored out of this commit. Private-ref: https://tasks.opencraft.com/browse/BB-9975
1 parent 51ec507 commit 6bd530b

File tree

11 files changed

+1351
-511
lines changed

11 files changed

+1351
-511
lines changed

forum/search/typesense.py

Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
"""
2+
Typesense backend for searching comments and threads.
3+
"""
4+
5+
from typing import Any, Optional
6+
7+
from bs4 import BeautifulSoup
8+
from django.conf import settings
9+
from django.core.paginator import Paginator
10+
from typesense import Client
11+
from typesense.types.collection import CollectionCreateSchema
12+
from typesense.types.document import DocumentSchema, SearchParameters
13+
from typesense.exceptions import ObjectNotFound
14+
15+
from forum.backends.mysql.models import Comment, CommentThread
16+
from forum.constants import FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT
17+
from forum.search.base import (
18+
BaseDocumentSearchBackend,
19+
BaseIndexSearchBackend,
20+
BaseSearchBackend,
21+
BaseThreadSearchBackend,
22+
)
23+
24+
# Lazily-created client shared by every caller in this process; populated by
# get_typesense_client() on first use.
_TYPESENSE_CLIENT: Client | None = None


def get_typesense_client() -> Client:
    """
    Return the shared Typesense client, building it on the first call.

    The client is configured from the ``TYPESENSE_API_KEY`` and
    ``TYPESENSE_URL`` Django settings and cached in the module-level
    ``_TYPESENSE_CLIENT`` variable, so later calls return the same object.
    """
    global _TYPESENSE_CLIENT
    if _TYPESENSE_CLIENT is not None:
        return _TYPESENSE_CLIENT

    client_config = {
        "api_key": settings.TYPESENSE_API_KEY,
        "nodes": [settings.TYPESENSE_URL],
    }
    _TYPESENSE_CLIENT = Client(client_config)
    return _TYPESENSE_CLIENT
40+
41+
42+
class CommentsIndex:
    """
    Schema, document shape and query construction for the comments index.
    """

    # Django model whose rows are indexed into this collection.
    model = Comment

    @staticmethod
    def name() -> str:
        """
        Return the name of the Typesense collection that stores comments.

        The name is the ``TYPESENSE_COLLECTION_PREFIX`` setting followed by
        ``"comments"``.
        """
        return settings.TYPESENSE_COLLECTION_PREFIX + "comments"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the schema used to create the comments collection.

        All declared fields are plain strings.
        """
        string_fields = ("course_id", "comment_thread_id", "body")
        return {
            "name": cls.name(),
            "fields": [
                {"name": field_name, "type": "string"} for field_name in string_fields
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Convert raw comment data into the document stored in Typesense.

        Missing keys become empty strings, and the body is reduced to its text
        content by parsing it as HTML and dropping the markup.
        """
        # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course".
        raw_body = data.get("body")
        if raw_body:
            plain_body = BeautifulSoup(raw_body, features="html.parser").get_text()
        else:
            plain_body = ""

        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "comment_thread_id": str(data.get("comment_thread_id", "")),
            "body": plain_body,
        }

    @staticmethod
    def build_search_parameters(
        *, search_text: str, course_id: str | None
    ) -> SearchParameters:
        """
        Build the Typesense search parameters for a comment body search.

        The query runs against the ``body`` field. When ``course_id`` is
        given, results are restricted to that exact course; otherwise the
        filter is left empty.
        """
        course_filter = ""
        if course_id:
            course_filter = f"course_id:={quote_filter_value(course_id)}"

        return {
            "q": search_text,
            "query_by": "body",
            "filter_by": course_filter,
            "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
        }
99+
100+
101+
class CommentThreadsIndex:
    """
    Common data and operations relating to the comment threads index.
    """

    # Django model whose rows are indexed into this collection.
    model = CommentThread

    @staticmethod
    def name() -> str:
        """
        Return the Typesense index name for the index.

        The name is the ``TYPESENSE_COLLECTION_PREFIX`` setting followed by
        ``"comment_threads"``.
        """
        return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the schema used to create the comment threads collection.

        All declared fields are plain strings.
        """
        return {
            "name": cls.name(),
            "fields": [
                {"name": "course_id", "type": "string"},
                {"name": "commentable_id", "type": "string"},
                {"name": "context", "type": "string"},
                {"name": "title", "type": "string"},
                {"name": "body", "type": "string"},
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Build a Typesense document for this index.

        Missing keys become empty strings, and the body is reduced to its text
        content by parsing it as HTML and dropping the markup.
        """
        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "commentable_id": str(data.get("commentable_id", "")),
            "context": str(data.get("context", "")),
            "title": str(data.get("title", "")),
            "body": (
                BeautifulSoup(data["body"], features="html.parser").get_text()
                if data.get("body")
                else ""
            ),
        }

    @staticmethod
    def build_search_parameters(
        *,
        search_text: str,
        course_id: str | None,
        context: str,
        commentable_ids: list[str] | None,
    ) -> SearchParameters:
        """
        Build Typesense search parameters for this index.

        The query runs against ``title`` and ``body``. Results are always
        filtered by ``context``; they are additionally restricted to the given
        ``commentable_ids`` (any of them) and to the exact ``course_id`` when
        those arguments are provided.
        """
        # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy.
        filters = [f"context:{quote_filter_value(context)}"]
        if commentable_ids:
            safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids)
            # Filter on the singular `commentable_id` field declared in schema()
            # and written by build_document(); no `commentable_ids` field exists.
            filters.append(f"commentable_id:[{safe_ids}]")
        if course_id:
            filters.append(f"course_id:={quote_filter_value(course_id)}")

        return {
            "q": search_text,
            "query_by": "title,body",
            "filter_by": " && ".join(filters),
            "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
        }
171+
172+
173+
# Registry of the index helper classes, keyed by the index name that callers
# pass to the document backend.
INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = dict(
    comments=CommentsIndex,
    comment_threads=CommentThreadsIndex,
)
177+
178+
179+
class TypesenseDocumentBackend(BaseDocumentSearchBackend):
    """
    Create, update and delete individual documents in Typesense.
    """

    def index_document(
        self, index_name: str, doc_id: str | int, document: dict[str, Any]
    ) -> None:
        """
        Upsert a single document into the collection for ``index_name``.

        ``index_name`` must be a key of ``INDICES``; the matching index class
        converts ``document`` into the shape stored in Typesense.
        """
        client = get_typesense_client()
        index_cls = INDICES[index_name]
        payload = index_cls.build_document(doc_id, document)
        collection = client.collections[index_cls.name()]
        collection.documents.upsert(payload)

    def update_document(
        self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
    ) -> None:
        """
        Update a document by delegating to ``index_document`` (an upsert).

        NOTE(review): the document is rebuilt from ``update_data`` alone, so
        keys missing from it are written as empty strings. This presumably
        expects callers to pass the complete document -- confirm.
        """
        return self.index_document(index_name, doc_id, update_data)

    def delete_document(self, index_name: str, doc_id: str | int) -> None:
        """
        Remove a document from the collection for ``index_name``.

        The request sets ``ignore_not_found`` so that deleting a document that
        is not in the collection is not treated as an error.
        """
        client = get_typesense_client()
        index_cls = INDICES[index_name]
        collection = client.collections[index_cls.name()]
        collection.documents[str(doc_id)].delete(
            delete_parameters={"ignore_not_found": True},
        )
212+
213+
214+
class TypesenseIndexBackend(BaseIndexSearchBackend):
    """
    Manage indexes for the Typesense backend.

    Typesense calls these "collections". https://typesense.org/docs/29.0/api/collections.html
    """

    def initialize_indices(self, force_new_index: bool = False) -> None:
        """
        Initialize the indices in Typesense.

        Collections that do not exist yet are created from their schema.
        If force_new_index is True, the indexes will be dropped before being recreated.
        """
        client = get_typesense_client()
        for index in INDICES.values():
            exists: bool = True
            try:
                client.collections[index.name()].retrieve()
            except ObjectNotFound:
                exists = False

            if force_new_index and exists:
                client.collections[index.name()].delete()

            if force_new_index or not exists:
                client.collections.create(index.schema())

    def rebuild_indices(
        self, batch_size: int = 500, extra_catchup_minutes: int = 5
    ) -> None:
        """
        Reindex everything in Typesense

        The Typesense collections are dropped and recreated,
        and data is reindexed from the MySQL database in batches of
        ``batch_size`` rows.

        Only MySQL-backed instances are supported.
        Note that the `extra_catchup_minutes` argument is ignored.
        """
        client = get_typesense_client()
        self.initialize_indices(force_new_index=True)
        for index in INDICES.values():
            # Paginating an unordered queryset gives no guarantee that pages are
            # disjoint, so rows could be skipped or repeated; order by primary key.
            queryset = index.model.objects.all().order_by("pk")
            paginator = Paginator(queryset, per_page=batch_size)
            for page_number in paginator.page_range:
                page = paginator.get_page(page_number)
                documents = [
                    index.build_document(obj.pk, obj.doc_to_hash())
                    for obj in page.object_list
                ]
                if documents:
                    # NOTE(review): the per-document results returned by the import
                    # call are not inspected, so individual failures would go
                    # unnoticed -- confirm whether they should be reported.
                    client.collections[index.name()].documents.import_(
                        documents, {"action": "upsert"}
                    )

    @staticmethod
    def _field_types(schema: Any) -> dict[str, str]:
        """
        Map each field name in a collection schema to its declared type.
        """
        return {field["name"]: field["type"] for field in schema.get("fields", [])}

    def validate_indices(self) -> None:
        """
        Check if the indices exist and are valid.

        Raise an exception if any do not exist or if any are not valid:
        retrieving a missing collection raises the Typesense client's own
        error, and a collection whose field names or types differ from the
        expected schema raises AssertionError.
        """
        client = get_typesense_client()
        for index in INDICES.values():
            expected = index.schema()
            found = client.collections[index.name()].retrieve()
            # The retrieved collection carries more information than the schema
            # it was created from, so compare only field names and types
            # instead of the whole response.
            if self._field_types(found) != self._field_types(expected):
                raise AssertionError(
                    f"Collection {index.name()} exists, but schema does not match expected. "
                    f"Expected schema: {expected}. Found schema: {found}."
                )

    def refresh_indices(self) -> None:
        """
        Noop on Typesense, as all write API operations are synchronous.

        See https://typesense.org/docs/guide/migrating-from-algolia.html#synchronous-write-apis for more information.
        """
        return None

    def delete_unused_indices(self) -> int:
        """
        Noop on Typesense.

        Always returns 0.
        """
        return 0
299+
300+
301+
def quote_filter_value(value: str) -> str:
    """
    Wrap a value in backticks so it can be embedded in a Typesense filter.

    Any backtick already present in ``value`` is removed first, so the value
    cannot terminate the quoting early.

    https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters
    """
    backtick = "`"
    stripped = value.replace(backtick, "")
    return "".join((backtick, stripped, backtick))
308+
309+
310+
class TypesenseThreadSearchBackend(BaseThreadSearchBackend):
    """
    Search for threads through the Typesense collections.
    """

    def get_thread_ids(
        self,
        context: str,
        # This argument is unsupported. Anyway, its only role was to boost some results,
        # which did not have much effect because they are shuffled anyway downstream.
        group_ids: list[int],
        search_text: str,
        # This parameter is unsupported, but as far as we know it's not used anywhere.
        sort_criteria: Optional[list[dict[str, str]]] = None,
        commentable_ids: Optional[list[str]] = None,
        course_id: Optional[str] = None,
    ) -> list[str]:
        """
        Return the IDs of threads matching the search criteria.

        Threads match either directly (title or body) or, for course-context
        searches without a commentable filter, through one of their comments.
        IDs are de-duplicated; ``group_ids`` and ``sort_criteria`` are ignored.
        """
        client = get_typesense_client()
        matched_ids: set[str] = set()

        # All comments have "course" as their context, and none of them have a commentable_id.
        comments_are_searchable = context == "course" and not commentable_ids
        if comments_are_searchable:
            comment_documents = client.collections[CommentsIndex.name()].documents
            comment_query = CommentsIndex.build_search_parameters(
                search_text=search_text, course_id=course_id
            )
            comment_response = comment_documents.search(comment_query)
            matched_ids.update(
                hit["document"]["comment_thread_id"]
                for hit in comment_response.get("hits", [])
            )

        thread_documents = client.collections[CommentThreadsIndex.name()].documents
        thread_query = CommentThreadsIndex.build_search_parameters(
            search_text=search_text,
            course_id=course_id,
            context=context,
            commentable_ids=commentable_ids,
        )
        thread_response = thread_documents.search(thread_query)
        matched_ids.update(
            hit["document"]["id"] for hit in thread_response.get("hits", [])
        )

        return list(matched_ids)

    def get_suggested_text(self, search_text: str) -> Optional[str]:
        """
        Return a suggested replacement for the search text.

        Suggestions are not implemented for Typesense, so this always
        returns None.

        :param search_text: Text to search for suggestions
        :return: Suggested text or None
        """
        # TODO: https://typesense.org/docs/guide/query-suggestions.html
        # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text?
        return None
368+
369+
370+
class TypesenseBackend(BaseSearchBackend):
    """
    Search backend that wires together the Typesense implementations.
    """

    # Indexes, updates and deletes individual documents.
    DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend
    # Creates, rebuilds and validates the collections.
    INDEX_SEARCH_CLASS = TypesenseIndexBackend
    # Runs thread searches.
    THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend

forum/settings/common.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ def plugin_settings(settings: Any) -> None:
1010
Common settings for forum app
1111
"""
1212
# Search backend
13-
if getattr(settings, "MEILISEARCH_ENABLED", False):
13+
if getattr(settings, "TYPESENSE_ENABLED", False):
14+
settings.FORUM_SEARCH_BACKEND = getattr(
15+
settings,
16+
"FORUM_SEARCH_BACKEND",
17+
"forum.search.typesense.TypesenseBackend",
18+
)
19+
elif getattr(settings, "MEILISEARCH_ENABLED", False):
1420
settings.FORUM_SEARCH_BACKEND = getattr(
1521
settings,
1622
"FORUM_SEARCH_BACKEND",

requirements/base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ requests
1111
pymongo
1212
elasticsearch
1313
edx-search # meilisearch backend
14+
typesense
1415
mysqlclient

0 commit comments

Comments
 (0)