From 1d757c81308a14a2a7e30a8e107f2072f03d1168 Mon Sep 17 00:00:00 2001 From: Carolin Walter Date: Fri, 24 Apr 2026 11:57:13 +0000 Subject: [PATCH 01/34] added pipeline functions for updater --- learn2rag/pipeline/app.py | 10 +--- learn2rag/pipeline/ingestion.py | 20 +------ learn2rag/pipeline/json_loader.py | 12 ---- learn2rag/pipeline/main.py | 44 +++++++++++++- learn2rag/pipeline/store.py | 99 +++++++++++++++++++++++++++++++ 5 files changed, 146 insertions(+), 39 deletions(-) delete mode 100644 learn2rag/pipeline/json_loader.py create mode 100644 learn2rag/pipeline/store.py diff --git a/learn2rag/pipeline/app.py b/learn2rag/pipeline/app.py index d6958f5..c2e2a13 100644 --- a/learn2rag/pipeline/app.py +++ b/learn2rag/pipeline/app.py @@ -171,14 +171,6 @@ async def search( return await search_authorized(user=input.user, question=input.question) - - -@app.post("/ingest") -async def ingest() -> None: - ingestion.index(user_config, opt_config) - - - @app.get("/test") async def test() -> TestResponse: - return TestResponse(message="Hello World") + return TestResponse(message="Hello World") \ No newline at end of file diff --git a/learn2rag/pipeline/ingestion.py b/learn2rag/pipeline/ingestion.py index b4c1150..68a52d9 100644 --- a/learn2rag/pipeline/ingestion.py +++ b/learn2rag/pipeline/ingestion.py @@ -12,8 +12,6 @@ from .qdrant import Qdrant from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance - -from . import json_loader from .embeddings import create_embeddings @@ -106,9 +104,8 @@ def payload(sample: dict[str, Any]) -> dict[str, str]: "document_id": sample["metadata"].get("document_id", "") } -def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: - logging.info('Loading documents') - all_documents = json_loader.json_loader(user_config['imported_documents_file_path']) +def index(documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + all_documents = documents # Split documents into chunks logging.info('Splitting documents into chunks') @@ -196,15 +193,4 @@ def index(user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: elif opt_config["query_mode"] == "multi": insert_multi(qdrant, collection_name, sample) else: - insert(qdrant, collection_name, sample) - - -def main() -> None: - logging.basicConfig(level=logging.INFO) - from .config import user_config, opt_config - index(user_config, opt_config) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - main() + insert(qdrant, collection_name, sample) \ No newline at end of file diff --git a/learn2rag/pipeline/json_loader.py b/learn2rag/pipeline/json_loader.py deleted file mode 100644 index 739bc60..0000000 --- a/learn2rag/pipeline/json_loader.py +++ /dev/null @@ -1,12 +0,0 @@ -from langchain_community.document_loaders import JSONLoader -from langchain_core.documents import Document - - -def json_loader(file_path: str) -> list[Document]: - loader = JSONLoader( - file_path, - jq_schema=".[]", - content_key="content", - metadata_func=lambda record, meta: record.get("metadata", {}), - ) - return loader.load() \ No newline at end of file diff --git a/learn2rag/pipeline/main.py b/learn2rag/pipeline/main.py index aea86fd..5600deb 100755 --- a/learn2rag/pipeline/main.py +++ b/learn2rag/pipeline/main.py @@ -4,9 +4,12 @@ import yaml import json +from langchain_core.documents.base import Document + from . import ingestion from . import search from . import generate +from .store import delete_collection, delete_documents, get_documents, update_documents if __name__ == "__main__": @@ -17,7 +20,46 @@ from .config import user_config, opt_config - ingestion.index(user_config, opt_config) + #delete_collection(loader_id="json_test_file", user_config=user_config, opt_config=opt_config) + results = get_documents(loader_id="json_test_file", user_config=user_config, opt_config=opt_config) + + documents = [ + Document(page_content=d["content"], metadata=d["metadata"]) + for d in [ + { + "metadata": { + "source": "C:C:\\Users\\foo\\Revised Manuscript_Text categorization approach.docx", + "content_hash": "e18e509d138cf86c22df0b0dfafc5ca5b8f1e266f5e3470de68190f3ebe495b0", + "source_path": "C:\\Users\\foo", + "file_extension": "docx", + "process_date": "2025-07-28", + "process_time": "14:42:02", + "loader_type": "DirectoryLoader", + "loader_id": "json_test_file", + "title": "The title of a real document", + "summary": "This document is awesome" + }, + "content": "A brand-new Corpus-based Real-time Text Classification and Tagging Approach for Social Data..." + }, + { + "metadata": { + "source": "C:C:\\Users\\foo\\qdrant.docx", + "content_hash": "7f3b9c1a0d4e6f8b2c5a7d9e1f0b3c6d8a4e2f1c9b7d0a6e5f1c3a8b9d2e4f0", + "source_path": "C:\\Users\\foo", + "file_extension": "docx", + "process_date": "2025-07-28", + "process_time": "14:42:02", + "loader_type": "DirectoryLoader", + "loader_id": "json_test_file", + "title": "The title of a real document", + "summary": "This document is awesome" + }, + "content": "Qdrant ist eine Open-Source-Vektordatenbank..." + }, + ] +] + update_documents(loader_id="json_test_file", documents=documents, user_config=user_config, opt_config=opt_config) + ingestion.index(documents, user_config, opt_config) if opt_config["query_mode"] == "multi": # in query_mode 'multi' different querys for each vector in the multi-vector are allowed diff --git a/learn2rag/pipeline/store.py b/learn2rag/pipeline/store.py new file mode 100644 index 0000000..56cec20 --- /dev/null +++ b/learn2rag/pipeline/store.py @@ -0,0 +1,99 @@ +import logging +from typing import Any + +from langchain_core import documents +from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, FilterSelector +from langchain_core.documents.base import Document + +from learn2rag.pipeline.ingestion import index +from learn2rag.pipeline.qdrant import Qdrant + +def delete_collection(loader_id: str|None, user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """Delete a collection from the vector store or a subset of points based on loader_id.""" + qdrant = Qdrant(user_config["collection_name"], opt_config) + if qdrant.client.collection_exists(user_config["collection_name"]): + if loader_id is None: + logging.info('Deleting entire collection: %s', user_config["collection_name"]) + qdrant.client.delete_collection(collection_name=user_config["collection_name"]) + return + else: + # Delete points with the specified loader_id + logging.info('Deleting points with loader_id: %s from collection: %s', loader_id, user_config["collection_name"]) + qdrant.client.delete( + collection_name=user_config["collection_name"], + points_selector=FilterSelector( + filter=Filter( + must=[FieldCondition( + key="loader_id", + match=MatchValue(value=loader_id), + ), + ], + ) + ), + ) + +def delete_documents(loader_id: str, paths: list[str], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """Delete documents from the vector store based on loader_id and paths.""" + qdrant = Qdrant(user_config["collection_name"], opt_config) + if qdrant.client.collection_exists(user_config["collection_name"]): + logging.info('Deleting documents with loader_id: %s and paths: %s', loader_id, paths) + # Delete points with the specified loader_id and paths + for path in paths: + qdrant.client.delete( + collection_name=user_config["collection_name"], + points_selector=FilterSelector( + filter=Filter( + must=[ + FieldCondition( + key="loader_id", + match=MatchValue(value=loader_id), + ), + FieldCondition( + key="path", + match=MatchValue(value=path) + ), + ], + ) + ), + ) + +def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[str, Any]) -> list[dict[str, Any]]|None: + """Retrieve documents from the vector store based on loader_id.""" + qdrant = Qdrant(user_config["collection_name"], opt_config) + if qdrant.client.collection_exists(user_config["collection_name"]): + logging.info('Scrolling through collection to retrieve documents with loader_id: %s', loader_id) + filter = Filter( + must=[ + FieldCondition( + key="loader_id", + match=MatchValue(value=loader_id) + ) + ] + ) + points = [] + offset = None + + while True: + result = qdrant.client.scroll( + collection_name=user_config["collection_name"], + scroll_filter=filter, + limit=100, + offset=offset, + with_payload=True, + with_vectors=False, + ) + + points.extend(result[0]) + offset = result[1] + + if offset is None: + break + return [point.payload for point in points if point.payload is not None] + return None + +def update_documents(loader_id: str, documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + qdrant = Qdrant(user_config["collection_name"], opt_config) + if qdrant.client.collection_exists(user_config["collection_name"]): + logging.info('Updating documents with loader_id: %s', loader_id) + delete_documents(loader_id, paths=[doc.metadata["source"] for doc in documents], user_config=user_config, opt_config=opt_config) + index(documents, user_config, opt_config) \ No newline at end of file From ad52c5eb8c366295dc962de1fcb71bb0f1c8e0ec Mon Sep 17 00:00:00 2001 From: kymeyer Date: Tue, 28 Apr 2026 08:04:28 +0200 Subject: [PATCH 02/34] prepare loaders for updater Co-authored-by: Copilot --- learn2rag/importer/loaders/drupal_loader.py | 112 +++++- learn2rag/importer/loaders/process_loaders.py | 328 +++++++++++++++++- .../importer/loaders/sharepoint_loader.py | 170 ++++++++- learn2rag/importer/utils/import_state.py | 128 +++++++ learn2rag/pipeline/ingestion.py | 166 ++++++++- 5 files changed, 872 insertions(+), 32 deletions(-) create mode 100644 learn2rag/importer/utils/import_state.py diff --git a/learn2rag/importer/loaders/drupal_loader.py b/learn2rag/importer/loaders/drupal_loader.py index a6a731a..9c70868 100644 --- a/learn2rag/importer/loaders/drupal_loader.py +++ b/learn2rag/importer/loaders/drupal_loader.py @@ -11,13 +11,14 @@ Author: Kyrill Meyer Institution: IFDT -Version: 0.0.1 +Version: 0.0.2 Creation Date: March 17, 2026 -Last Modified: March 17, 2026 +Last Modified: April 24, 2026 """ import hashlib import logging +from datetime import datetime, timezone from typing import Any, Dict, List, Optional from bs4 import BeautifulSoup from langchain_core.documents import Document @@ -140,6 +141,7 @@ def load_from_drupal( text_fields: Optional[List[str]] = None, page_size: int = 50, language: str = "", + since: Optional[datetime] = None, ) -> List[Document]: """ Load documents from a Drupal instance via the JSON:API. @@ -193,6 +195,15 @@ def load_from_drupal( "page[offset]": 0, } + # Timestamp filter: only documents changed on or after `since` + if since is not None: + # Ensure the timestamp is timezone-aware and formatted as ISO 8601 for JSON:API + since_utc = since.astimezone(timezone.utc) if since.tzinfo else since.replace(tzinfo=timezone.utc) + params["filter[changed-filter][condition][path]"] = "changed" + params["filter[changed-filter][condition][operator]"] = ">=" + params["filter[changed-filter][condition][value]"] = since_utc.isoformat() + logger.info(f"DrupalLoader: applying since-filter >= {since_utc.isoformat()} for '{content_type}'") + page_count = 0 next_url: Optional[str] = endpoint @@ -301,3 +312,100 @@ def load_from_drupal( logger.info(f"DrupalLoader: finished. Total documents loaded: {len(all_documents)}") return all_documents + + +def get_all_drupal_document_ids( + base_url: str, + content_types: List[str], + auth_type: str = "none", + username: str = "", + password: str = "", + token: str = "", + page_size: int = 100, + language: str = "", +) -> List[str]: + """ + Retrieve the source URL for every current node in Drupal without loading content. + + Intended for deletion detection in the 2-pass delta import: compare the returned + set against the paths stored in Qdrant to find nodes that have been removed. + + Args: + base_url (str): Base URL of the Drupal site, e.g. ``"https://example.com"``. + content_types (list): List of content type machine names, e.g. ``["article", "page"]``. + auth_type (str): Authentication type: ``"none"``, ``"basic"``, or ``"token"``. + username (str): Username for Basic Auth. + password (str): Password for Basic Auth. + token (str): Bearer token for token auth. + page_size (int): Items per API page request (default 100; larger than load_from_drupal + default because only IDs are fetched, saving bandwidth). + language (str): Optional language filter passed via ``Accept-Language`` header. + + Returns: + List[str]: Source URLs of all current nodes, e.g. ``["https://example.com/node/42"]``. + """ + session = _build_session(auth_type, username, password, token) + if language: + session.headers.update({"Accept-Language": language}) + + endpoint_map = _discover_endpoint_map(base_url, session) + all_ids: List[str] = [] + + for content_type in content_types: + if stop_loading: + break + + resource_key = f"node--{content_type}" + if resource_key in endpoint_map: + endpoint = endpoint_map[resource_key] + else: + endpoint = f"{base_url.rstrip('/')}/jsonapi/node/{content_type}" + + # Request only nid + langcode (no content) to minimise bandwidth + params: Dict[str, Any] = { + "fields[node--{}]".format(content_type): "drupal_internal__nid,langcode", + "page[limit]": page_size, + "page[offset]": 0, + } + + next_url: Optional[str] = endpoint + page_count = 0 + + while next_url: + try: + if page_count == 0: + response = session.get(next_url, params=params, timeout=30) + else: + response = session.get(next_url, timeout=30) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as e: + logger.error(f"get_all_drupal_document_ids: Request failed for '{content_type}': {e}") + break + + items = data.get("data", []) + if not items: + break + + for item in items: + attributes: Dict[str, Any] = item.get("attributes", {}) + node_id: str = item.get("id", "") + drupal_id: str = str(attributes.get("drupal_internal__nid", node_id)) + source_url = f"{base_url.rstrip('/')}/node/{drupal_id}" + all_ids.append(source_url) + + links = data.get("links", {}) + next_link = links.get("next") + if next_link and isinstance(next_link, dict): + next_url = next_link.get("href") + elif next_link and isinstance(next_link, str): + next_url = next_link + else: + next_url = None + + page_count += 1 + + logger.info(f"get_all_drupal_document_ids: found {len(all_ids)} IDs so far after content type '{content_type}'") + + logger.info(f"get_all_drupal_document_ids: total {len(all_ids)} document IDs") + return all_ids diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index f9a4a06..a6eb3bd 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -6,20 +6,24 @@ Author: Kyrill Meyer Institution: IFDT -Version: 0.0.5 +Version: 0.0.6 Creation Date: June 10, 2025 -Last Modified: March 17, 2026 +Last Modified: April 24, 2026 """ +import hashlib import logging -from typing import List, Dict, Any +from typing import Callable, Dict, List, Any, TYPE_CHECKING +if TYPE_CHECKING: + from learn2rag.pipeline.qdrant import Qdrant + from learn2rag.importer.utils.import_state import ImportState from ..globals import stop_loading from langchain_core.documents import Document from .directory_loader import load_from_directory from .csv_loader import load_from_csv from .html_loader import load_html_content -from .sharepoint_loader import load_from_sharepoint -from .drupal_loader import load_from_drupal +from .sharepoint_loader import load_from_sharepoint, get_all_sharepoint_document_ids +from .drupal_loader import load_from_drupal, get_all_drupal_document_ids # # initialize logger @@ -60,10 +64,15 @@ def process_configuration_entries(config_entries: List[Dict[str, Any]]) -> List[ documents = load_from_directory(path, recursive=recursive, silent_errors=silent_errors, loader_id=loader_id) logger.info(f"Loaded {len(documents)} documents from {path} using {loader_type} for configuration entry with loader_id: {loader_id}.") elif loader_type == "CSVLoader": + path = entry.get("path") if not path: logger.error("Missing 'path' for 'CSVLoader' in configuration entry.") continue documents = load_from_csv(path) + # CSVLoader does not set loader_id or content_hash — populate them here + for doc in documents: + doc.metadata["loader_id"] = loader_id + doc.metadata["content_hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() logger.info(f"Loaded {len(documents)} documents from {path} using {loader_type}.") elif loader_type == "HTMLLoader": url = entry.get("url") @@ -152,3 +161,312 @@ def process_configuration_entries(config_entries: List[Dict[str, Any]]) -> List[ logger.error(f"Error processing entry {entry}: {e}") return all_documents + + +def process_delta_imports( + config_entries: List[Dict[str, Any]], + qdrant: "Qdrant", + user_config: Dict[str, Any], + opt_config: Dict[str, Any], + import_state: "ImportState", +) -> None: + """ + Perform a delta import for all configured loaders. + + Dispatches between two strategies: + + - **Intelligent loaders** (DrupalLoader, SharepointLoader): 2-pass approach — + fetch all current document IDs to detect deletions, then load only changed + documents using a server-side timestamp filter. + - **Normal loaders** (DirectoryLoader, HTMLLoader, CSVLoader): full load followed + by content-hash comparison to detect additions, changes, and deletions. + + The import timestamp is only persisted on successful completion, so a failed run + will be retried in full on the next call. + + Args: + config_entries (List[Dict[str, Any]]): Loader configuration entries from the + importer config file. + qdrant (Qdrant): Authenticated Qdrant wrapper instance + (``learn2rag.pipeline.qdrant.Qdrant``). + user_config (Dict[str, Any]): User configuration dict (must contain + ``collection_name``). + opt_config (Dict[str, Any]): Optimisation configuration dict. + import_state (ImportState): ImportState instance for timestamp management. + """ + from datetime import datetime, timezone + from learn2rag.pipeline.ingestion import ( + get_documents_by_loader_id, + delete_chunks_by_document, + ingest_batch, + ) + + collection_name = user_config.get("collection_name", opt_config.get("collection_name", "")) + + for entry in config_entries: + if stop_loading: + logger.info("Delta import stopped by user.") + break + + loader_type = entry.get("loader_type") + loader_id = entry.get("loader_id") or "" + + if not loader_type or not loader_id: + logger.error(f"process_delta_imports: entry missing loader_type or loader_id: {entry}") + continue + + try: + last_import_time = import_state.get_last_import_time(loader_id) + import_start = datetime.now(timezone.utc) + import_state.record_import_start(loader_id, import_start) + + # Retrieve existing Qdrant documents for this loader: {source_path: content_hash} + existing_docs: Dict[str, str] = get_documents_by_loader_id(qdrant, collection_name, loader_id) + is_initial = len(existing_docs) == 0 + + logger.info( + f"Delta import '{loader_id}': is_initial={is_initial}, " + f"last_import_time={last_import_time}, existing_docs={len(existing_docs)}" + ) + + # ---------------------------------------------------------------- + # INTELLIGENT LOADERS: Drupal / SharePoint + # 2-pass: (1) fetch all current IDs → detect deletions, + # (2) load only changed documents via timestamp filter + # ---------------------------------------------------------------- + if loader_type == "DrupalLoader": + base_url = entry.get("base_url") + if not base_url: + logger.error(f"DrupalLoader '{loader_id}': missing 'base_url'") + continue + base_url = str(base_url) + content_types = entry.get("content_types", []) + auth_type = str(entry.get("auth_type", "none")) + username = str(entry.get("username", "")) + password = str(entry.get("password", "")) + token = str(entry.get("token", "")) + text_fields = entry.get("text_fields") + page_size = int(entry.get("page_size", 50)) + language = str(entry.get("language", "")) + + if is_initial or last_import_time is None: + # No prior state: fall back to full load + hash comparison + logger.info(f"Drupal '{loader_id}': full load (initial={is_initial})") + all_docs = load_from_drupal( + base_url=base_url, content_types=content_types, loader_id=loader_id, + auth_type=auth_type, username=username, password=password, token=token, + text_fields=text_fields, page_size=page_size, language=language, + ) + if is_initial: + ingest_batch(all_docs, qdrant, user_config, opt_config) + else: + # Hash comparison: replace changed, remove deleted + _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + else: + # 2-pass delta + logger.info(f"Drupal '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") + # Pass 1: fetch all current IDs to detect deleted documents + current_ids = set(get_all_drupal_document_ids( + base_url=base_url, content_types=content_types, + auth_type=auth_type, username=username, password=password, token=token, + page_size=page_size, language=language, + )) + deleted_paths = [p for p in existing_docs if p not in current_ids] + for path in deleted_paths: + logger.info(f"Drupal '{loader_id}': deleting removed document {path}") + delete_chunks_by_document(qdrant, collection_name, loader_id, path) + + # Pass 2: load and index changed documents + changed_docs = load_from_drupal( + base_url=base_url, content_types=content_types, loader_id=loader_id, + auth_type=auth_type, username=username, password=password, token=token, + text_fields=text_fields, page_size=page_size, language=language, + since=last_import_time, + ) + for doc in changed_docs: + source = doc.metadata.get("source", "") + delete_chunks_by_document(qdrant, collection_name, loader_id, source) + ingest_batch(changed_docs, qdrant, user_config, opt_config) + logger.info(f"Drupal '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") + + elif loader_type == "SharepointLoader": + client_id = entry.get("client_id", "") + client_secret = entry.get("client_secret", "") + document_library_id = entry.get("document_library_id", "") + folder_path = entry.get("folder_path") + folder_id = entry.get("folder_id") + recursive = entry.get("recursive", False) + auth_with_token = entry.get("auth_with_token", True) + reset_token = entry.get("reset_token", False) + tenant_id = entry.get("tenant_id", "common") + site_id = entry.get("site_id") + + if is_initial or last_import_time is None: + logger.info(f"SharePoint '{loader_id}': full load (initial={is_initial})") + all_docs = load_from_sharepoint( + client_id=client_id, client_secret=client_secret, + document_library_id=document_library_id, folder_path=folder_path, + folder_id=folder_id, recursive=recursive, auth_with_token=auth_with_token, + reset_token=reset_token, tenant_id=tenant_id, site_id=site_id, + loader_id=loader_id, + ) + if is_initial: + ingest_batch(all_docs, qdrant, user_config, opt_config) + else: + _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + else: + logger.info(f"SharePoint '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") + # Pass 1: fetch all current URLs to detect deleted documents + current_ids = set(get_all_sharepoint_document_ids( + client_id=client_id, client_secret=client_secret, + document_library_id=document_library_id, folder_path=folder_path, + folder_id=folder_id, recursive=recursive, auth_with_token=auth_with_token, + reset_token=reset_token, tenant_id=tenant_id, site_id=site_id, + )) + deleted_paths = [p for p in existing_docs if p not in current_ids] + for path in deleted_paths: + logger.info(f"SharePoint '{loader_id}': deleting removed document {path}") + delete_chunks_by_document(qdrant, collection_name, loader_id, path) + + # Pass 2: load and index changed documents + changed_docs = load_from_sharepoint( + client_id=client_id, client_secret=client_secret, + document_library_id=document_library_id, folder_path=folder_path, + folder_id=folder_id, recursive=recursive, auth_with_token=auth_with_token, + reset_token=reset_token, tenant_id=tenant_id, site_id=site_id, + loader_id=loader_id, since=last_import_time, + ) + for doc in changed_docs: + source = doc.metadata.get("source", "") + delete_chunks_by_document(qdrant, collection_name, loader_id, source) + ingest_batch(changed_docs, qdrant, user_config, opt_config) + logger.info(f"SharePoint '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") + + # ---------------------------------------------------------------- + # NORMAL LOADERS: Directory / HTML / CSV — hash comparison + # ---------------------------------------------------------------- + elif loader_type == "DirectoryLoader": + path = entry.get("path") + if not path: + logger.error(f"DirectoryLoader '{loader_id}': missing 'path'") + continue + all_docs = load_from_directory( + path, + recursive=entry.get("recursive", False), + silent_errors=entry.get("silent_errors", True), + loader_id=loader_id, + ) + if is_initial: + ingest_batch(all_docs, qdrant, user_config, opt_config) + else: + _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + + elif loader_type == "HTMLLoader": + url = entry.get("url") + depth = entry.get("depth", 0) + if not url: + logger.error(f"HTMLLoader '{loader_id}': missing 'url'") + continue + all_docs = load_html_content(url, depth=depth, loader_id=loader_id) + if is_initial: + ingest_batch(all_docs, qdrant, user_config, opt_config) + else: + _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + + elif loader_type == "CSVLoader": + path = entry.get("path") + if not path: + logger.error(f"CSVLoader '{loader_id}': missing 'path'") + continue + all_docs = load_from_csv(path) + for doc in all_docs: + doc.metadata["loader_id"] = loader_id + doc.metadata["content_hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() + if is_initial: + ingest_batch(all_docs, qdrant, user_config, opt_config) + else: + _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + + else: + logger.error(f"process_delta_imports: unknown loader_type '{loader_type}' for loader_id '{loader_id}'") + continue + + import_state.save_success(loader_id) + logger.info(f"Delta import '{loader_id}': completed successfully.") + + except Exception as e: + logger.error(f"process_delta_imports: error processing loader '{loader_id}': {e}", exc_info=True) + + +def _delta_by_hash( + all_docs: List[Document], + existing_docs: Dict[str, str], + qdrant: "Qdrant", + collection_name: str, + loader_id: str, + user_config: Dict[str, Any], + opt_config: Dict[str, Any], + delete_chunks_by_document: Callable[..., None], + ingest_batch: Callable[..., None], +) -> None: + """ + Hash-based delta import for normal loaders (DirectoryLoader, HTMLLoader, CSVLoader). + + Groups freshly loaded documents by ``source`` URL, computes a combined content hash + per source, and then: + + - Deletes Qdrant chunks for sources that no longer exist in the new load. + - Re-indexes sources whose combined hash has changed (delete old + ingest new). + - Leaves unchanged sources untouched. + + Args: + all_docs (List[Document]): All documents returned by the loader for this run. + existing_docs (Dict[str, str]): Mapping of ``{source_url: content_hash}`` as + stored in Qdrant (from ``get_documents_by_loader_id``). + qdrant (Qdrant): Authenticated Qdrant wrapper instance. + collection_name (str): Target Qdrant collection name. + loader_id (str): Unique loader identifier. + user_config (Dict[str, Any]): User configuration dict. + opt_config (Dict[str, Any]): Optimisation configuration dict. + delete_chunks_by_document (Callable): Function with signature + ``(qdrant, collection, loader_id, path) -> None``. + ingest_batch (Callable): Function with signature + ``(docs, qdrant, user_config, opt_config) -> None``. + """ + # Group freshly loaded documents by source URL (1 source = N chunks) + # Comparison is performed at source level using the combined content hash + new_docs_by_source: Dict[str, List[Document]] = {} + for doc in all_docs: + source = doc.metadata.get("source", "") + new_docs_by_source.setdefault(source, []).append(doc) + + # Compute a combined content hash per source by concatenating all chunk hashes + new_hash_by_source: Dict[str, str] = {} + for source, docs in new_docs_by_source.items(): + combined = "".join(d.metadata.get("content_hash", d.page_content) for d in docs) + new_hash_by_source[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() + + # Remove documents that are no longer present in the fresh load + deleted_count = 0 + for source in list(existing_docs.keys()): + if source not in new_docs_by_source: + delete_chunks_by_document(qdrant, collection_name, loader_id, source) + deleted_count += 1 + + # Re-index changed and new documents + changed_docs: List[Document] = [] + for source, docs in new_docs_by_source.items(): + existing_hash = existing_docs.get(source) + if existing_hash != new_hash_by_source[source]: + if existing_hash is not None: + delete_chunks_by_document(qdrant, collection_name, loader_id, source) + changed_docs.extend(docs) + + if changed_docs: + ingest_batch(changed_docs, qdrant, user_config, opt_config) + + logger.info( + f"_delta_by_hash '{loader_id}': {deleted_count} deleted, " + f"{len(changed_docs)} chunks re-indexed from " + f"{len(set(d.metadata.get('source','') for d in changed_docs))} changed sources" + ) diff --git a/learn2rag/importer/loaders/sharepoint_loader.py b/learn2rag/importer/loaders/sharepoint_loader.py index 819b94a..7c521bf 100644 --- a/learn2rag/importer/loaders/sharepoint_loader.py +++ b/learn2rag/importer/loaders/sharepoint_loader.py @@ -7,15 +7,17 @@ and Site-Specific contexts. Author: Kyrill Meyer -Version: 0.0.5 +Version: 0.0.6 Institution: IFDT Creation Date: January 14, 2026 -Last Modified Date: March 17, 2026 +Last Modified Date: April 24, 2026 """ +import hashlib import logging import os import tempfile import shutil +from datetime import datetime, timezone from pathlib import Path from typing import List, Optional, Any, Union from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader, UnstructuredExcelLoader, PyPDFLoader @@ -105,7 +107,8 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> "created": str(original_item.created), "modified": str(original_item.modified), "loader": "SharePointLoader", - "loader_id": loader_id + "loader_id": loader_id, + "content_hash": hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest(), }) return docs @@ -137,11 +140,13 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> for doc in docs: doc.metadata.update({ "source": original_item.web_url, - "sharepoint_id": original_item.object_id, + "document_id": original_item.object_id, "name": original_item.name, "created": str(original_item.created), "modified": str(original_item.modified), - "loader": "SharePointLoader" + "loader": "SharePointLoader", + "loader_id": loader_id, + "content_hash": hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest(), }) return docs @@ -234,7 +239,7 @@ def _list_available_drives(account: Account, search_term: Optional[str] = None) except Exception as e: logger.error(f"Error while listing available drives: {e}") -def _load_items_manual_traversal(drive: Any, folder_id: Optional[str] = None, recursive: bool = True, loader_id: str = "N/A") -> List[Document]: +def _load_items_manual_traversal(drive: Any, folder_id: Optional[str] = None, recursive: bool = True, loader_id: str = "N/A", since: Optional[datetime] = None) -> List[Document]: """ Internal helper to manually traverse and load items into Document objects. This bypasses LangChain's internal 'storage()' call which fails in App-Only context. @@ -276,6 +281,17 @@ def _process_folder_items(item_list: Any) -> None: elif item.is_file: try: + # Seit-Filter: Dateien überspringen, die vor `since` zuletzt geändert wurden + if since is not None: + item_modified = item.modified + if item_modified is not None: + # Sicherstellen, dass beide tz-aware sind + since_utc = since.astimezone(timezone.utc) if since.tzinfo else since.replace(tzinfo=timezone.utc) + item_modified_utc = item_modified.astimezone(timezone.utc) if item_modified.tzinfo else item_modified.replace(tzinfo=timezone.utc) + if item_modified_utc < since_utc: + logger.debug(f"Skipping unchanged file (modified={item_modified_utc.isoformat()}): {item.name}") + continue + # 2. Download file download_success = item.download(to_path=temp_dir) @@ -307,12 +323,150 @@ def _process_folder_items(item_list: Any) -> None: return documents +def _list_items_web_urls(drive: Any, folder_id: Optional[str] = None, recursive: bool = True) -> List[str]: + """ + Traverse SharePoint files without downloading and collect their web URLs. + + Intended for deletion detection in the 2-pass delta import: compare the returned + set against the paths stored in Qdrant to find files that have been removed. + + Args: + drive (Any): Authenticated O365 Drive object. + folder_id (Optional[str]): Object ID of the folder to start from. + Uses the drive root when ``None``. + recursive (bool): Whether to traverse sub-folders recursively (default ``True``). + + Returns: + List[str]: Web URLs of all files found, e.g. + ``["https://tenant.sharepoint.com/.../file.pdf"]``. + """ + urls: List[str] = [] + + try: + if folder_id: + folder = drive.get_item(folder_id) + else: + folder = drive.get_root_folder() + + items = folder.get_items() + except Exception as e: + logger.error(f"_list_items_web_urls: error accessing folder: {e}") + return urls + + for item in items: + try: + if item.is_folder and recursive: + sub_urls = _list_items_web_urls(drive, folder_id=item.object_id, recursive=recursive) + urls.extend(sub_urls) + elif item.is_file: + if item.web_url: + urls.append(item.web_url) + except Exception as e: + logger.warning(f"_list_items_web_urls: error processing item {getattr(item, 'name', '?')}: {e}") + + return urls + + +def get_all_sharepoint_document_ids( + client_id: str, + client_secret: str, + document_library_id: str, + folder_path: Optional[str] = None, + folder_id: Optional[str] = None, + recursive: bool = False, + auth_with_token: bool = True, + reset_token: bool = False, + tenant_id: str = "common", + site_id: Optional[str] = None, +) -> List[str]: + """ + Retrieve the web URL for every file in a SharePoint document library without loading content. + + Intended for deletion detection in the 2-pass delta import: compare the returned + set against the paths stored in Qdrant to find files that have been removed. + + Args: + client_id (str): Azure AD application (client) ID. + client_secret (str): Azure AD client secret. + document_library_id (str): GUID of the SharePoint document library (Drive ID). + folder_path (Optional[str]): Slash-separated path to a sub-folder relative to + the library root, e.g. ``"Docs/Reports"``. + folder_id (Optional[str]): Object ID of the entry-point folder; takes precedence + over ``folder_path`` when both are provided. + recursive (bool): Whether to traverse sub-folders recursively (default ``False``). + auth_with_token (bool): Use cached O365 token when available (default ``True``). + reset_token (bool): Delete the cached token before authenticating (default ``False``). + tenant_id (str): Azure AD tenant ID or ``"common"`` (default). + site_id (Optional[str]): SharePoint site ID; when provided, the library is looked + up on that specific site rather than the root site. + + Returns: + List[str]: Web URLs of all files found, e.g. + ``["https://tenant.sharepoint.com/.../file.pdf"]``. + """ + if reset_token: + reset_o365_token() + + token_path = Path.home() / ".credentials" / "o365_token.txt" + token_backend = FileSystemTokenBackend(token_path=Path.home() / ".credentials", token_filename="o365_token.txt") + + if (not auth_with_token) or (not token_path.exists()): + if tenant_id and tenant_id != "common": + _authenticate_directly_with_o365(client_id, client_secret, tenant_id) + else: + logger.error("get_all_sharepoint_document_ids: No valid authentication method available.") + return [] + + account = Account((client_id, client_secret), token_backend=token_backend) + + if not account.is_authenticated: + logger.error("get_all_sharepoint_document_ids: Authentication failed.") + return [] + + try: + if site_id: + sp = account.sharepoint() + site = sp.get_site(site_id) + storage = site.storage + else: + storage = account.storage() + + drive = storage.get_drive(document_library_id) + if drive is None: + logger.error(f"get_all_sharepoint_document_ids: Drive not found: {document_library_id}") + return [] + + # Optionaler Unterordner-Start + effective_folder_id = folder_id + if folder_path and not folder_id: + root = drive.get_root_folder() + for part in folder_path.strip("/").split("/"): + found = None + for child in root.get_items(): + if child.is_folder and child.name == part: + found = child + break + if found: + root = found + else: + logger.warning(f"get_all_sharepoint_document_ids: folder part '{part}' not found") + return [] + effective_folder_id = root.object_id + + return _list_items_web_urls(drive, folder_id=effective_folder_id, recursive=recursive) + + except Exception as e: + logger.error(f"get_all_sharepoint_document_ids: error: {e}") + return [] + + def load_from_sharepoint(client_id: str, client_secret: str, document_library_id: str, folder_path: Optional[str] = None, folder_id: Optional[str] = None, object_ids: Optional[List[str]] = None, recursive: bool = False, auth_with_token: bool = True, load_extended_metadata: bool = True, reset_token: bool = False, tenant_id: str = "common", - site_id: Optional[str] = None, loader_id: str = "N/A") -> List[Document]: + site_id: Optional[str] = None, loader_id: str = "N/A", + since: Optional[datetime] = None) -> List[Document]: """ Load documents from SharePoint and set metadata. """ @@ -372,7 +526,7 @@ def load_from_sharepoint(client_id: str, client_secret: str, document_library_id # Load documents using internal helper function # Use folder_id if provided, otherwise use Root of the Drive - loaded_docs = _load_items_manual_traversal(drive, folder_id=folder_id, recursive=recursive, loader_id=loader_id) + loaded_docs = _load_items_manual_traversal(drive, folder_id=folder_id, recursive=recursive, loader_id=loader_id, since=since) logger.info(f"Found {len(loaded_docs)} documents.") diff --git a/learn2rag/importer/utils/import_state.py b/learn2rag/importer/utils/import_state.py new file mode 100644 index 0000000..3356f4b --- /dev/null +++ b/learn2rag/importer/utils/import_state.py @@ -0,0 +1,128 @@ +""" +import_state.py + +Description: + Manages the persistent per-loader import state (last_import_timestamp). + The state file (import_state.json) is stored next to the importer_config.json. + + State file format: + { + "loader_id": { + "last_import_timestamp": "2026-04-27T10:00:00+00:00" + } + } + +Author: Kyrill Meyer +Institution: IFDT +Version: 0.0.1 +Creation Date: April 27, 2026 +Last Modified: April 27, 2026 +""" + +import json +import logging +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional + +logger = logging.getLogger("Learn2RAGImporter") + + +class ImportState: + """ + Manages import state (timestamps) per loader. + + Tracks the start timestamp of the last successful import run per loader_id. + Changes are held in memory until ``save_success()`` is called, so a failed + import does not advance the stored timestamp. + """ + + def __init__(self, state_file_path: str) -> None: + """ + Initialise ImportState and load existing state from disk if present. + + Args: + state_file_path (str): Absolute or relative path to the JSON state file + (e.g. ``/data/import_state.json``). + """ + self._path = Path(state_file_path) + self._state: Dict[str, Any] = {} + self._pending: Dict[str, datetime] = {} # in-memory only, not yet persisted + self._load() + + def _load(self) -> None: + if self._path.exists(): + try: + with self._path.open("r", encoding="utf-8") as f: + self._state = json.load(f) + logger.info("Import state loaded from %s", self._path) + except (json.JSONDecodeError, OSError) as e: + logger.warning("Could not load import state from %s: %s — starting fresh.", self._path, e) + self._state = {} + else: + self._state = {} + + def _save(self) -> None: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("w", encoding="utf-8") as f: + json.dump(self._state, f, ensure_ascii=False, indent=2) + logger.debug("Import state saved to %s", self._path) + + def get_last_import_time(self, loader_id: str) -> Optional[datetime]: + """ + Return the start timestamp of the last successful import for a loader. + + Args: + loader_id (str): Unique loader identifier as defined in the importer config. + + Returns: + Optional[datetime]: UTC-aware datetime of the last successful import start, + or ``None`` if no state exists for this loader. + """ + entry = self._state.get(loader_id) + if not entry: + return None + ts_str = entry.get("last_import_timestamp") + if not ts_str: + return None + try: + dt = datetime.fromisoformat(ts_str) + # If no timezone info is present, treat as UTC + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError as e: + logger.warning("Invalid timestamp for loader_id=%s: %s", loader_id, e) + return None + + def record_import_start(self, loader_id: str, timestamp: datetime) -> None: + """ + Record the start time of an ongoing import in memory (not yet persisted). + + Must be called before ``save_success()`` for the same loader_id. + + Args: + loader_id (str): Unique loader identifier. + timestamp (datetime): UTC-aware datetime representing the import start time. + """ + self._pending[loader_id] = timestamp + + def save_success(self, loader_id: str) -> None: + """ + Persist the previously recorded import start timestamp to disk. + + Must be preceded by a call to ``record_import_start()`` for the same + loader_id; raises ``AssertionError`` otherwise. + + Args: + loader_id (str): Unique loader identifier. + """ + assert loader_id in self._pending, ( + f"save_success() called for loader_id='{loader_id}' without prior record_import_start()" + ) + ts = self._pending.pop(loader_id) + self._state[loader_id] = { + "last_import_timestamp": ts.isoformat() + } + self._save() + logger.info("Import state saved for loader_id=%s (start_time=%s)", loader_id, ts.isoformat()) diff --git a/learn2rag/pipeline/ingestion.py b/learn2rag/pipeline/ingestion.py index 68a52d9..be36f1e 100644 --- a/learn2rag/pipeline/ingestion.py +++ b/learn2rag/pipeline/ingestion.py @@ -10,7 +10,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents.base import Document from .qdrant import Qdrant -from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance +from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance, FilterSelector from .embeddings import create_embeddings @@ -104,28 +104,137 @@ def payload(sample: dict[str, Any]) -> dict[str, str]: "document_id": sample["metadata"].get("document_id", "") } -def index(documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: - all_documents = documents +def get_documents_by_loader_id(qdrant: Qdrant, collection_name: str, loader_id: str) -> dict[str, str]: + """ + Scroll all Qdrant points for a given loader and return their path-to-hash mapping. - # Split documents into chunks - logging.info('Splitting documents into chunks') - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=opt_config["chunk_size"], chunk_overlap=opt_config["chunk_overlap"] + Args: + qdrant (Qdrant): Authenticated Qdrant wrapper instance. + collection_name (str): Target collection name. + loader_id (str): Unique loader identifier to filter by. + + Returns: + dict[str, str]: Mapping of ``{source_path: content_hash}`` for every point + belonging to this loader. Returns an empty dict when the + collection does not exist. + """ + if not qdrant.client.collection_exists(collection_name): + return {} + scroll_filter = Filter( + must=[FieldCondition(key="loader_id", match=MatchValue(value=loader_id))] ) - chunks = text_splitter.split_documents(all_documents) + result: dict[str, str] = {} + offset = None + while True: + points, offset = qdrant.client.scroll( + collection_name=collection_name, + scroll_filter=scroll_filter, + limit=100, + offset=offset, + with_payload=True, + with_vectors=False, + ) + for point in points: + if point.payload: + path = point.payload.get("path", "") + content_hash = point.payload.get("content_hash", "") + if path: + result[path] = content_hash + if offset is None: + break + return result - collection_name = user_config["collection_name"] - # Init vector store - qdrant = Qdrant( +def delete_chunks_by_document(qdrant: Qdrant, collection_name: str, loader_id: str, path: str) -> None: + """ + Delete all Qdrant chunks that belong to a specific document. + + Matches on the combined filter ``loader_id == X AND path == Y``. + + Args: + qdrant (Qdrant): Authenticated Qdrant wrapper instance. + collection_name (str): Target collection name. + loader_id (str): Unique loader identifier. + path (str): Source path / URL of the document whose chunks should be deleted. + """ + if not qdrant.client.collection_exists(collection_name): + return + logging.info('Deleting chunks for loader_id=%s path=%s', loader_id, path) + qdrant.client.delete( collection_name=collection_name, - opt_config=opt_config + points_selector=FilterSelector( + filter=Filter( + must=[ + FieldCondition(key="loader_id", match=MatchValue(value=loader_id)), + FieldCondition(key="path", match=MatchValue(value=path)), + ] + ) + ), + ) + + +def delete_all_chunks_by_loader_id(qdrant: Qdrant, collection_name: str, loader_id: str) -> None: + """ + Delete all Qdrant chunks that belong to a specific loader. + + Use this when a data source is removed from a pipeline to clean up all + associated vectors. + + Args: + qdrant (Qdrant): Authenticated Qdrant wrapper instance. + collection_name (str): Target collection name. + loader_id (str): Unique loader identifier whose chunks should be deleted. + """ + if not qdrant.client.collection_exists(collection_name): + return + logging.info('Deleting all chunks for loader_id=%s', loader_id) + qdrant.client.delete( + collection_name=collection_name, + points_selector=FilterSelector( + filter=Filter( + must=[ + FieldCondition(key="loader_id", match=MatchValue(value=loader_id)), + ] + ) + ), + ) + + +def ingest_batch(docs: list[Document], qdrant: Qdrant, user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """ + Chunk, embed, and bulk-insert a list of documents into Qdrant. + + Mirrors the behaviour of the original ``index()`` function but accepts an + already-constructed ``Qdrant`` instance instead of creating one internally. + Intended for use by ``process_delta_imports`` and other callers that manage + their own Qdrant connection. + + Points that already exist (identical ``loader_id``, ``path``, ``content_hash``, + and ``chunk_hash``) are skipped via ``point_exists()``. + + Args: + docs (list[Document]): Documents to ingest. May be a full initial load or + a filtered subset of changed documents. + qdrant (Qdrant): Authenticated Qdrant wrapper instance. + user_config (dict[str, Any]): User configuration dict (must contain + ``collection_name``). + opt_config (dict[str, Any]): Optimisation configuration dict (must contain + ``chunk_size``, ``chunk_overlap``, + ``embedding_model``, and ``search_mode``). + """ + collection_name = user_config["collection_name"] + all_documents = docs + + logging.info('Splitting documents into chunks') + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=opt_config["chunk_size"], chunk_overlap=opt_config["chunk_overlap"] ) + chunks = text_splitter.split_documents(all_documents) chunks_content = [chunk.page_content for chunk in chunks] if len(opt_config["multi_search"]) > 0 and opt_config["query_mode"] == "multi": - chunks_metadata = {} - embeddings_metadata = {} + chunks_metadata: dict[str, list[str]] = {} + embeddings_metadata: dict[str, Any] = {} for item in opt_config["multi_search"]: chunks_metadata[item] = list(get_chunks_metadata(chunks, item)) embeddings_metadata[item] = create_embeddings(chunks_metadata[item], opt_config["embedding_model"], opt_config["search_mode"]) @@ -135,8 +244,7 @@ def index(documents: list[Document], user_config: dict[str, Any], opt_config: di else: raise TypeError(f"dense_vecs must be np.ndarray, got {type(dense_vecs)}") - chunk_hash = [hashlib.md5(chunk.page_content.encode()).hexdigest() for chunk in chunks] - # Todo: handle different vector lengths for batch encoding when using sparse vectors + chunk_hash = [hashlib.md5(chunk.page_content.encode()).hexdigest() for chunk in chunks] logging.info('Creating embeddings...') embeddings = create_embeddings(chunks_content, opt_config["embedding_model"], opt_config["search_mode"]) @@ -193,4 +301,28 @@ def index(documents: list[Document], user_config: dict[str, Any], opt_config: di elif opt_config["query_mode"] == "multi": insert_multi(qdrant, collection_name, sample) else: - insert(qdrant, collection_name, sample) \ No newline at end of file + insert(qdrant, collection_name, sample) + + +def index(documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """ + Ingest a list of documents — entry point for standalone pipeline operation. + + Creates a ``Qdrant`` instance internally and delegates to ``ingest_batch()``. + This function also serves as the replacement for the originally planned + ``ingest_document()`` helper: a single-document delta upsert is expressed as + ``index([doc], user_config, opt_config)`` without requiring a separate function. + + Called by ``pipeline/main.py`` and the ``/ingest`` HTTP endpoint. For the + delta-import path (which manages its own Qdrant connection), use + ``ingest_batch()`` directly. + + Args: + documents (list[Document]): One or more documents to ingest. + user_config (dict[str, Any]): User configuration dict (must contain + ``collection_name``). + opt_config (dict[str, Any]): Optimisation configuration dict. + """ + collection_name = user_config["collection_name"] + qdrant = Qdrant(collection_name=collection_name, opt_config=opt_config) + ingest_batch(documents, qdrant, user_config, opt_config) \ No newline at end of file From 920a5d01a0db9628e2cc9d214d63cd88daeea066 Mon Sep 17 00:00:00 2001 From: kymeyer Date: Tue, 28 Apr 2026 11:18:18 +0200 Subject: [PATCH 03/34] use store.py functions Co-authored-by: Copilot --- learn2rag/importer/loaders/process_loaders.py | 70 +++++-------- learn2rag/pipeline/ingestion.py | 97 +------------------ learn2rag/pipeline/store.py | 52 +++++++++- 3 files changed, 78 insertions(+), 141 deletions(-) diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index a6eb3bd..61e3014 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -13,10 +13,11 @@ import hashlib import logging -from typing import Callable, Dict, List, Any, TYPE_CHECKING +from typing import Dict, List, Any, TYPE_CHECKING if TYPE_CHECKING: - from learn2rag.pipeline.qdrant import Qdrant from learn2rag.importer.utils.import_state import ImportState +from learn2rag.pipeline.ingestion import index +from learn2rag.pipeline.store import get_document_hashes, delete_documents from ..globals import stop_loading from langchain_core.documents import Document from .directory_loader import load_from_directory @@ -165,7 +166,6 @@ def process_configuration_entries(config_entries: List[Dict[str, Any]]) -> List[ def process_delta_imports( config_entries: List[Dict[str, Any]], - qdrant: "Qdrant", user_config: Dict[str, Any], opt_config: Dict[str, Any], import_state: "ImportState", @@ -187,21 +187,12 @@ def process_delta_imports( Args: config_entries (List[Dict[str, Any]]): Loader configuration entries from the importer config file. - qdrant (Qdrant): Authenticated Qdrant wrapper instance - (``learn2rag.pipeline.qdrant.Qdrant``). user_config (Dict[str, Any]): User configuration dict (must contain ``collection_name``). opt_config (Dict[str, Any]): Optimisation configuration dict. import_state (ImportState): ImportState instance for timestamp management. """ from datetime import datetime, timezone - from learn2rag.pipeline.ingestion import ( - get_documents_by_loader_id, - delete_chunks_by_document, - ingest_batch, - ) - - collection_name = user_config.get("collection_name", opt_config.get("collection_name", "")) for entry in config_entries: if stop_loading: @@ -221,7 +212,7 @@ def process_delta_imports( import_state.record_import_start(loader_id, import_start) # Retrieve existing Qdrant documents for this loader: {source_path: content_hash} - existing_docs: Dict[str, str] = get_documents_by_loader_id(qdrant, collection_name, loader_id) + existing_docs: Dict[str, str] = get_document_hashes(loader_id, user_config, opt_config) is_initial = len(existing_docs) == 0 logger.info( @@ -258,10 +249,10 @@ def process_delta_imports( text_fields=text_fields, page_size=page_size, language=language, ) if is_initial: - ingest_batch(all_docs, qdrant, user_config, opt_config) + index(all_docs, user_config, opt_config) else: # Hash comparison: replace changed, remove deleted - _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) else: # 2-pass delta logger.info(f"Drupal '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") @@ -274,7 +265,7 @@ def process_delta_imports( deleted_paths = [p for p in existing_docs if p not in current_ids] for path in deleted_paths: logger.info(f"Drupal '{loader_id}': deleting removed document {path}") - delete_chunks_by_document(qdrant, collection_name, loader_id, path) + delete_documents(loader_id, [path], user_config, opt_config) # Pass 2: load and index changed documents changed_docs = load_from_drupal( @@ -285,8 +276,8 @@ def process_delta_imports( ) for doc in changed_docs: source = doc.metadata.get("source", "") - delete_chunks_by_document(qdrant, collection_name, loader_id, source) - ingest_batch(changed_docs, qdrant, user_config, opt_config) + delete_documents(loader_id, [source], user_config, opt_config) + index(changed_docs, user_config, opt_config) logger.info(f"Drupal '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") elif loader_type == "SharepointLoader": @@ -311,9 +302,9 @@ def process_delta_imports( loader_id=loader_id, ) if is_initial: - ingest_batch(all_docs, qdrant, user_config, opt_config) + index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) else: logger.info(f"SharePoint '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") # Pass 1: fetch all current URLs to detect deleted documents @@ -326,7 +317,7 @@ def process_delta_imports( deleted_paths = [p for p in existing_docs if p not in current_ids] for path in deleted_paths: logger.info(f"SharePoint '{loader_id}': deleting removed document {path}") - delete_chunks_by_document(qdrant, collection_name, loader_id, path) + delete_documents(loader_id, [path], user_config, opt_config) # Pass 2: load and index changed documents changed_docs = load_from_sharepoint( @@ -338,8 +329,8 @@ def process_delta_imports( ) for doc in changed_docs: source = doc.metadata.get("source", "") - delete_chunks_by_document(qdrant, collection_name, loader_id, source) - ingest_batch(changed_docs, qdrant, user_config, opt_config) + delete_documents(loader_id, [source], user_config, opt_config) + index(changed_docs, user_config, opt_config) logger.info(f"SharePoint '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") # ---------------------------------------------------------------- @@ -357,9 +348,9 @@ def process_delta_imports( loader_id=loader_id, ) if is_initial: - ingest_batch(all_docs, qdrant, user_config, opt_config) + index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) elif loader_type == "HTMLLoader": url = entry.get("url") @@ -369,9 +360,9 @@ def process_delta_imports( continue all_docs = load_html_content(url, depth=depth, loader_id=loader_id) if is_initial: - ingest_batch(all_docs, qdrant, user_config, opt_config) + index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) elif loader_type == "CSVLoader": path = entry.get("path") @@ -383,9 +374,9 @@ def process_delta_imports( doc.metadata["loader_id"] = loader_id doc.metadata["content_hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() if is_initial: - ingest_batch(all_docs, qdrant, user_config, opt_config) + index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, qdrant, collection_name, loader_id, user_config, opt_config, delete_chunks_by_document, ingest_batch) + _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) else: logger.error(f"process_delta_imports: unknown loader_type '{loader_type}' for loader_id '{loader_id}'") @@ -401,13 +392,9 @@ def process_delta_imports( def _delta_by_hash( all_docs: List[Document], existing_docs: Dict[str, str], - qdrant: "Qdrant", - collection_name: str, loader_id: str, user_config: Dict[str, Any], opt_config: Dict[str, Any], - delete_chunks_by_document: Callable[..., None], - ingest_batch: Callable[..., None], ) -> None: """ Hash-based delta import for normal loaders (DirectoryLoader, HTMLLoader, CSVLoader). @@ -422,16 +409,11 @@ def _delta_by_hash( Args: all_docs (List[Document]): All documents returned by the loader for this run. existing_docs (Dict[str, str]): Mapping of ``{source_url: content_hash}`` as - stored in Qdrant (from ``get_documents_by_loader_id``). - qdrant (Qdrant): Authenticated Qdrant wrapper instance. - collection_name (str): Target Qdrant collection name. + stored in Qdrant (from ``get_document_hashes``). loader_id (str): Unique loader identifier. - user_config (Dict[str, Any]): User configuration dict. + user_config (Dict[str, Any]): User configuration dict (must contain + ``collection_name``). opt_config (Dict[str, Any]): Optimisation configuration dict. - delete_chunks_by_document (Callable): Function with signature - ``(qdrant, collection, loader_id, path) -> None``. - ingest_batch (Callable): Function with signature - ``(docs, qdrant, user_config, opt_config) -> None``. """ # Group freshly loaded documents by source URL (1 source = N chunks) # Comparison is performed at source level using the combined content hash @@ -450,7 +432,7 @@ def _delta_by_hash( deleted_count = 0 for source in list(existing_docs.keys()): if source not in new_docs_by_source: - delete_chunks_by_document(qdrant, collection_name, loader_id, source) + delete_documents(loader_id, [source], user_config, opt_config) deleted_count += 1 # Re-index changed and new documents @@ -459,11 +441,11 @@ def _delta_by_hash( existing_hash = existing_docs.get(source) if existing_hash != new_hash_by_source[source]: if existing_hash is not None: - delete_chunks_by_document(qdrant, collection_name, loader_id, source) + delete_documents(loader_id, [source], user_config, opt_config) changed_docs.extend(docs) if changed_docs: - ingest_batch(changed_docs, qdrant, user_config, opt_config) + index(changed_docs, user_config, opt_config) logger.info( f"_delta_by_hash '{loader_id}': {deleted_count} deleted, " diff --git a/learn2rag/pipeline/ingestion.py b/learn2rag/pipeline/ingestion.py index be36f1e..a0d7013 100644 --- a/learn2rag/pipeline/ingestion.py +++ b/learn2rag/pipeline/ingestion.py @@ -10,7 +10,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents.base import Document from .qdrant import Qdrant -from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance, FilterSelector +from qdrant_client.models import PointStruct, Filter, FieldCondition, MatchValue, SparseVector, VectorParams, MultiVectorConfig, MultiVectorComparator, Distance from .embeddings import create_embeddings @@ -104,101 +104,6 @@ def payload(sample: dict[str, Any]) -> dict[str, str]: "document_id": sample["metadata"].get("document_id", "") } -def get_documents_by_loader_id(qdrant: Qdrant, collection_name: str, loader_id: str) -> dict[str, str]: - """ - Scroll all Qdrant points for a given loader and return their path-to-hash mapping. - - Args: - qdrant (Qdrant): Authenticated Qdrant wrapper instance. - collection_name (str): Target collection name. - loader_id (str): Unique loader identifier to filter by. - - Returns: - dict[str, str]: Mapping of ``{source_path: content_hash}`` for every point - belonging to this loader. Returns an empty dict when the - collection does not exist. - """ - if not qdrant.client.collection_exists(collection_name): - return {} - scroll_filter = Filter( - must=[FieldCondition(key="loader_id", match=MatchValue(value=loader_id))] - ) - result: dict[str, str] = {} - offset = None - while True: - points, offset = qdrant.client.scroll( - collection_name=collection_name, - scroll_filter=scroll_filter, - limit=100, - offset=offset, - with_payload=True, - with_vectors=False, - ) - for point in points: - if point.payload: - path = point.payload.get("path", "") - content_hash = point.payload.get("content_hash", "") - if path: - result[path] = content_hash - if offset is None: - break - return result - - -def delete_chunks_by_document(qdrant: Qdrant, collection_name: str, loader_id: str, path: str) -> None: - """ - Delete all Qdrant chunks that belong to a specific document. - - Matches on the combined filter ``loader_id == X AND path == Y``. - - Args: - qdrant (Qdrant): Authenticated Qdrant wrapper instance. - collection_name (str): Target collection name. - loader_id (str): Unique loader identifier. - path (str): Source path / URL of the document whose chunks should be deleted. - """ - if not qdrant.client.collection_exists(collection_name): - return - logging.info('Deleting chunks for loader_id=%s path=%s', loader_id, path) - qdrant.client.delete( - collection_name=collection_name, - points_selector=FilterSelector( - filter=Filter( - must=[ - FieldCondition(key="loader_id", match=MatchValue(value=loader_id)), - FieldCondition(key="path", match=MatchValue(value=path)), - ] - ) - ), - ) - - -def delete_all_chunks_by_loader_id(qdrant: Qdrant, collection_name: str, loader_id: str) -> None: - """ - Delete all Qdrant chunks that belong to a specific loader. - - Use this when a data source is removed from a pipeline to clean up all - associated vectors. - - Args: - qdrant (Qdrant): Authenticated Qdrant wrapper instance. - collection_name (str): Target collection name. - loader_id (str): Unique loader identifier whose chunks should be deleted. - """ - if not qdrant.client.collection_exists(collection_name): - return - logging.info('Deleting all chunks for loader_id=%s', loader_id) - qdrant.client.delete( - collection_name=collection_name, - points_selector=FilterSelector( - filter=Filter( - must=[ - FieldCondition(key="loader_id", match=MatchValue(value=loader_id)), - ] - ) - ), - ) - def ingest_batch(docs: list[Document], qdrant: Qdrant, user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: """ diff --git a/learn2rag/pipeline/store.py b/learn2rag/pipeline/store.py index 56cec20..fb58083 100644 --- a/learn2rag/pipeline/store.py +++ b/learn2rag/pipeline/store.py @@ -90,7 +90,57 @@ def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[ break return [point.payload for point in points if point.payload is not None] return None - + + +def get_document_hashes(loader_id: str, user_config: dict[str, Any], opt_config: dict[str, Any]) -> dict[str, str]: + """ + Retrieve a path-to-content_hash mapping for all documents belonging to a loader. + + Scrolls the Qdrant collection and deduplicates by source path, keeping the last + seen content hash per path. Used by the delta-import orchestration to determine + which documents are new, changed, or deleted. + + Args: + loader_id (str): Unique loader identifier to filter by. + user_config (dict[str, Any]): User configuration dict (must contain + ``collection_name``). + opt_config (dict[str, Any]): Optimisation configuration dict. + + Returns: + dict[str, str]: Mapping of ``{source_path: content_hash}`` for every document + belonging to this loader. Returns an empty dict when the + collection does not exist. + """ + qdrant = Qdrant(user_config["collection_name"], opt_config) + collection_name = user_config["collection_name"] + if not qdrant.client.collection_exists(collection_name): + return {} + logging.info('Retrieving document hashes for loader_id: %s', loader_id) + scroll_filter = Filter( + must=[FieldCondition(key="loader_id", match=MatchValue(value=loader_id))] + ) + result: dict[str, str] = {} + offset = None + while True: + points, offset = qdrant.client.scroll( + collection_name=collection_name, + scroll_filter=scroll_filter, + limit=100, + offset=offset, + with_payload=True, + with_vectors=False, + ) + for point in points: + if point.payload: + path = point.payload.get("path", "") + content_hash = point.payload.get("content_hash", "") + if path: + result[path] = content_hash + if offset is None: + break + return result + + def update_documents(loader_id: str, documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: qdrant = Qdrant(user_config["collection_name"], opt_config) if qdrant.client.collection_exists(user_config["collection_name"]): From 276726bea9f9930856e714654c0e8021809b3f90 Mon Sep 17 00:00:00 2001 From: kymeyer Date: Mon, 4 May 2026 14:38:08 +0200 Subject: [PATCH 04/34] use get_documents and switch to "source"-identifier Co-authored-by: Copilot --- learn2rag/importer/loaders/process_loaders.py | 137 +++++++++++------- learn2rag/importer/readme.md | 9 +- learn2rag/pipeline/ingestion.py | 4 +- learn2rag/pipeline/store.py | 54 +------ 4 files changed, 94 insertions(+), 110 deletions(-) diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index 61e3014..7bb332f 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -6,9 +6,9 @@ Author: Kyrill Meyer Institution: IFDT -Version: 0.0.6 +Version: 0.0.7 Creation Date: June 10, 2025 -Last Modified: April 24, 2026 +Last Modified: May 4, 2026 """ import hashlib @@ -17,7 +17,7 @@ if TYPE_CHECKING: from learn2rag.importer.utils.import_state import ImportState from learn2rag.pipeline.ingestion import index -from learn2rag.pipeline.store import get_document_hashes, delete_documents +from learn2rag.pipeline.store import get_documents, delete_documents, update_documents from ..globals import stop_loading from langchain_core.documents import Document from .directory_loader import load_from_directory @@ -70,9 +70,10 @@ def process_configuration_entries(config_entries: List[Dict[str, Any]]) -> List[ logger.error("Missing 'path' for 'CSVLoader' in configuration entry.") continue documents = load_from_csv(path) - # CSVLoader does not set loader_id or content_hash — populate them here + # CSVLoader does not set loader_id, source or content_hash — populate them here for doc in documents: doc.metadata["loader_id"] = loader_id + doc.metadata["source"] = path doc.metadata["content_hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() logger.info(f"Loaded {len(documents)} documents from {path} using {loader_type}.") elif loader_type == "HTMLLoader": @@ -211,13 +212,14 @@ def process_delta_imports( import_start = datetime.now(timezone.utc) import_state.record_import_start(loader_id, import_start) - # Retrieve existing Qdrant documents for this loader: {source_path: content_hash} - existing_docs: Dict[str, str] = get_document_hashes(loader_id, user_config, opt_config) - is_initial = len(existing_docs) == 0 + # Retrieve existing Qdrant documents for this loader and build {source: combined_hash} map + payloads: List[Dict[str, Any]] = get_documents(loader_id, user_config, opt_config) or [] + existing_map: Dict[str, str] = _build_existing_map(payloads) + is_initial = len(existing_map) == 0 logger.info( f"Delta import '{loader_id}': is_initial={is_initial}, " - f"last_import_time={last_import_time}, existing_docs={len(existing_docs)}" + f"last_import_time={last_import_time}, existing_docs={len(existing_map)}" ) # ---------------------------------------------------------------- @@ -252,7 +254,7 @@ def process_delta_imports( index(all_docs, user_config, opt_config) else: # Hash comparison: replace changed, remove deleted - _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) + _delta_by_source(all_docs, existing_map, loader_id, user_config, opt_config) else: # 2-pass delta logger.info(f"Drupal '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") @@ -262,10 +264,10 @@ def process_delta_imports( auth_type=auth_type, username=username, password=password, token=token, page_size=page_size, language=language, )) - deleted_paths = [p for p in existing_docs if p not in current_ids] - for path in deleted_paths: - logger.info(f"Drupal '{loader_id}': deleting removed document {path}") - delete_documents(loader_id, [path], user_config, opt_config) + deleted_paths = [p for p in existing_map if p not in current_ids] + if deleted_paths: + logger.info(f"Drupal '{loader_id}': deleting {len(deleted_paths)} removed documents") + delete_documents(loader_id, deleted_paths, user_config, opt_config) # Pass 2: load and index changed documents changed_docs = load_from_drupal( @@ -274,9 +276,9 @@ def process_delta_imports( text_fields=text_fields, page_size=page_size, language=language, since=last_import_time, ) - for doc in changed_docs: - source = doc.metadata.get("source", "") - delete_documents(loader_id, [source], user_config, opt_config) + sources_to_delete = [doc.metadata.get("source", "") for doc in changed_docs] + if sources_to_delete: + delete_documents(loader_id, sources_to_delete, user_config, opt_config) index(changed_docs, user_config, opt_config) logger.info(f"Drupal '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") @@ -304,7 +306,7 @@ def process_delta_imports( if is_initial: index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) + _delta_by_source(all_docs, existing_map, loader_id, user_config, opt_config) else: logger.info(f"SharePoint '{loader_id}': 2-pass delta since {last_import_time.isoformat()}") # Pass 1: fetch all current URLs to detect deleted documents @@ -314,10 +316,10 @@ def process_delta_imports( folder_id=folder_id, recursive=recursive, auth_with_token=auth_with_token, reset_token=reset_token, tenant_id=tenant_id, site_id=site_id, )) - deleted_paths = [p for p in existing_docs if p not in current_ids] - for path in deleted_paths: - logger.info(f"SharePoint '{loader_id}': deleting removed document {path}") - delete_documents(loader_id, [path], user_config, opt_config) + deleted_paths = [p for p in existing_map if p not in current_ids] + if deleted_paths: + logger.info(f"SharePoint '{loader_id}': deleting {len(deleted_paths)} removed documents") + delete_documents(loader_id, deleted_paths, user_config, opt_config) # Pass 2: load and index changed documents changed_docs = load_from_sharepoint( @@ -327,9 +329,9 @@ def process_delta_imports( reset_token=reset_token, tenant_id=tenant_id, site_id=site_id, loader_id=loader_id, since=last_import_time, ) - for doc in changed_docs: - source = doc.metadata.get("source", "") - delete_documents(loader_id, [source], user_config, opt_config) + sources_to_delete = [doc.metadata.get("source", "") for doc in changed_docs] + if sources_to_delete: + delete_documents(loader_id, sources_to_delete, user_config, opt_config) index(changed_docs, user_config, opt_config) logger.info(f"SharePoint '{loader_id}': {len(deleted_paths)} deleted, {len(changed_docs)} updated") @@ -350,7 +352,7 @@ def process_delta_imports( if is_initial: index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) + _delta_by_source(all_docs, existing_map, loader_id, user_config, opt_config) elif loader_type == "HTMLLoader": url = entry.get("url") @@ -362,7 +364,7 @@ def process_delta_imports( if is_initial: index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) + _delta_by_source(all_docs, existing_map, loader_id, user_config, opt_config) elif loader_type == "CSVLoader": path = entry.get("path") @@ -372,11 +374,12 @@ def process_delta_imports( all_docs = load_from_csv(path) for doc in all_docs: doc.metadata["loader_id"] = loader_id + doc.metadata["source"] = path doc.metadata["content_hash"] = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() if is_initial: index(all_docs, user_config, opt_config) else: - _delta_by_hash(all_docs, existing_docs, loader_id, user_config, opt_config) + _delta_by_source(all_docs, existing_map, loader_id, user_config, opt_config) else: logger.error(f"process_delta_imports: unknown loader_type '{loader_type}' for loader_id '{loader_id}'") @@ -389,9 +392,39 @@ def process_delta_imports( logger.error(f"process_delta_imports: error processing loader '{loader_id}': {e}", exc_info=True) -def _delta_by_hash( +def _build_existing_map(payloads: List[Dict[str, Any]]) -> Dict[str, str]: + """ + Build a ``{source: combined_content_hash}`` mapping from Qdrant payloads. + + Groups payloads by ``source`` field and computes a combined hash per source + from the sorted individual ``content_hash`` values. This makes the comparison + stable regardless of chunk order. + + Args: + payloads (List[Dict[str, Any]]): Raw Qdrant payloads as returned by + ``get_documents()``. + + Returns: + Dict[str, str]: Mapping of ``{source: combined_hash}``. + """ + # Use a set to deduplicate content_hash values: all chunks of one document + # share the same content_hash, so duplicates must be collapsed before combining. + chunks_by_source: Dict[str, set] = {} + for payload in payloads: + source = payload.get("source", "") + content_hash = payload.get("content_hash", "") + if source: + chunks_by_source.setdefault(source, set()).add(content_hash) + result: Dict[str, str] = {} + for source, hashes in chunks_by_source.items(): + combined = "".join(sorted(hashes)) + result[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() + return result + + +def _delta_by_source( all_docs: List[Document], - existing_docs: Dict[str, str], + existing_map: Dict[str, str], loader_id: str, user_config: Dict[str, Any], opt_config: Dict[str, Any], @@ -399,56 +432,52 @@ def _delta_by_hash( """ Hash-based delta import for normal loaders (DirectoryLoader, HTMLLoader, CSVLoader). - Groups freshly loaded documents by ``source`` URL, computes a combined content hash + Groups freshly loaded documents by ``source``, computes a combined content hash per source, and then: - - Deletes Qdrant chunks for sources that no longer exist in the new load. - - Re-indexes sources whose combined hash has changed (delete old + ingest new). + - Deletes Qdrant chunks (bulk) for sources that no longer exist in the new load. + - Calls ``update_documents`` for sources whose combined hash has changed or that + are entirely new (update_documents handles delete-then-reindex internally). - Leaves unchanged sources untouched. Args: all_docs (List[Document]): All documents returned by the loader for this run. - existing_docs (Dict[str, str]): Mapping of ``{source_url: content_hash}`` as - stored in Qdrant (from ``get_document_hashes``). + existing_map (Dict[str, str]): Mapping of ``{source: combined_hash}`` as + built by ``_build_existing_map()``. loader_id (str): Unique loader identifier. user_config (Dict[str, Any]): User configuration dict (must contain ``collection_name``). opt_config (Dict[str, Any]): Optimisation configuration dict. """ - # Group freshly loaded documents by source URL (1 source = N chunks) - # Comparison is performed at source level using the combined content hash + # Group freshly loaded documents by source (1 source = N chunks) new_docs_by_source: Dict[str, List[Document]] = {} for doc in all_docs: - source = doc.metadata.get("source", "") + source: str = doc.metadata.get("source", "") new_docs_by_source.setdefault(source, []).append(doc) - # Compute a combined content hash per source by concatenating all chunk hashes + # Compute combined hash per source from sorted individual content_hashes new_hash_by_source: Dict[str, str] = {} for source, docs in new_docs_by_source.items(): - combined = "".join(d.metadata.get("content_hash", d.page_content) for d in docs) + hashes = sorted(d.metadata.get("content_hash", d.page_content) for d in docs) + combined = "".join(hashes) new_hash_by_source[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() - # Remove documents that are no longer present in the fresh load - deleted_count = 0 - for source in list(existing_docs.keys()): - if source not in new_docs_by_source: - delete_documents(loader_id, [source], user_config, opt_config) - deleted_count += 1 + # Bulk-delete sources that are no longer present in the fresh load + deleted_sources: List[str] = [s for s in existing_map if s not in new_docs_by_source] + if deleted_sources: + delete_documents(loader_id, deleted_sources, user_config, opt_config) - # Re-index changed and new documents + # Update changed and new sources via update_documents (delete-then-reindex) changed_docs: List[Document] = [] for source, docs in new_docs_by_source.items(): - existing_hash = existing_docs.get(source) - if existing_hash != new_hash_by_source[source]: - if existing_hash is not None: - delete_documents(loader_id, [source], user_config, opt_config) + if existing_map.get(source) != new_hash_by_source[source]: changed_docs.extend(docs) if changed_docs: - index(changed_docs, user_config, opt_config) + update_documents(loader_id, changed_docs, user_config, opt_config) + changed_source_count = len(set(d.metadata.get("source", "") for d in changed_docs)) logger.info( - f"_delta_by_hash '{loader_id}': {deleted_count} deleted, " - f"{len(changed_docs)} chunks re-indexed from " - f"{len(set(d.metadata.get('source','') for d in changed_docs))} changed sources" + f"_delta_by_source '{loader_id}': {len(deleted_sources)} deleted, " + f"{len(changed_docs)} chunks re-indexed from {changed_source_count} changed sources" ) diff --git a/learn2rag/importer/readme.md b/learn2rag/importer/readme.md index fd9eade..0a222a5 100644 --- a/learn2rag/importer/readme.md +++ b/learn2rag/importer/readme.md @@ -404,9 +404,14 @@ where - v0.0.6 - removed permission information from metadata in directory loader - added SharePoint Loader -- v0.0.7 +- v0.0.7 - added type checks - v0.0.8 - added loader_id for all loaders - improved error handling and dependency checking for directory loader - - added drupal_loader \ No newline at end of file + - added drupal_loader +- v0.0.9 + - unified `source` field as document identifier across all loaders in metadata (directory: file path, HTML: URL, SharePoint: web URL, Drupal: node URL, CSV: file path) + - delta import now uses `get_documents` from the pipeline + - hash comparison now uses sorted chunk hashes per source for stable results + - **Breaking change:** Qdrant payload field renamed from `path` → `source`; existing collections must be deleted and re-imported \ No newline at end of file diff --git a/learn2rag/pipeline/ingestion.py b/learn2rag/pipeline/ingestion.py index a0d7013..d4f5cfa 100644 --- a/learn2rag/pipeline/ingestion.py +++ b/learn2rag/pipeline/ingestion.py @@ -31,7 +31,7 @@ def point_exists(qdrant: Qdrant, collection_name: str, loader_id: str, path: str filter = Filter( must=[ FieldCondition(key="loader_id", match=MatchValue(value=loader_id)), - FieldCondition(key="path", match=MatchValue(value=path)), + FieldCondition(key="source", match=MatchValue(value=path)), FieldCondition(key="content_hash", match=MatchValue(value=content_hash)), FieldCondition(key="chunk_hash", match=MatchValue(value=chunk_hash)), ] @@ -95,7 +95,7 @@ def insert_multi(qdrant: Qdrant, collection_name: str, sample: dict[str, Any]) - def payload(sample: dict[str, Any]) -> dict[str, str]: return { "content": sample["page_content"], - "path": sample["metadata"]["source"], + "source": sample["metadata"]["source"], "content_hash": sample["metadata"]["content_hash"], "chunk_hash": sample["chunk_hash"], "title": sample["metadata"].get("title",""), diff --git a/learn2rag/pipeline/store.py b/learn2rag/pipeline/store.py index fb58083..f14c926 100644 --- a/learn2rag/pipeline/store.py +++ b/learn2rag/pipeline/store.py @@ -1,8 +1,7 @@ import logging from typing import Any -from langchain_core import documents -from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, FilterSelector +from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector from langchain_core.documents.base import Document from learn2rag.pipeline.ingestion import index @@ -49,7 +48,7 @@ def delete_documents(loader_id: str, paths: list[str], user_config: dict[str, An match=MatchValue(value=loader_id), ), FieldCondition( - key="path", + key="source", match=MatchValue(value=path) ), ], @@ -92,55 +91,6 @@ def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[ return None -def get_document_hashes(loader_id: str, user_config: dict[str, Any], opt_config: dict[str, Any]) -> dict[str, str]: - """ - Retrieve a path-to-content_hash mapping for all documents belonging to a loader. - - Scrolls the Qdrant collection and deduplicates by source path, keeping the last - seen content hash per path. Used by the delta-import orchestration to determine - which documents are new, changed, or deleted. - - Args: - loader_id (str): Unique loader identifier to filter by. - user_config (dict[str, Any]): User configuration dict (must contain - ``collection_name``). - opt_config (dict[str, Any]): Optimisation configuration dict. - - Returns: - dict[str, str]: Mapping of ``{source_path: content_hash}`` for every document - belonging to this loader. Returns an empty dict when the - collection does not exist. - """ - qdrant = Qdrant(user_config["collection_name"], opt_config) - collection_name = user_config["collection_name"] - if not qdrant.client.collection_exists(collection_name): - return {} - logging.info('Retrieving document hashes for loader_id: %s', loader_id) - scroll_filter = Filter( - must=[FieldCondition(key="loader_id", match=MatchValue(value=loader_id))] - ) - result: dict[str, str] = {} - offset = None - while True: - points, offset = qdrant.client.scroll( - collection_name=collection_name, - scroll_filter=scroll_filter, - limit=100, - offset=offset, - with_payload=True, - with_vectors=False, - ) - for point in points: - if point.payload: - path = point.payload.get("path", "") - content_hash = point.payload.get("content_hash", "") - if path: - result[path] = content_hash - if offset is None: - break - return result - - def update_documents(loader_id: str, documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: qdrant = Qdrant(user_config["collection_name"], opt_config) if qdrant.client.collection_exists(user_config["collection_name"]): From 08c3c81885e4fe8b941cc246dbd9b40adcfcced1 Mon Sep 17 00:00:00 2001 From: kymeyer Date: Wed, 6 May 2026 12:41:25 +0200 Subject: [PATCH 05/34] Update for doc-specific hashing Co-authored-by: Copilot --- .../importer/loaders/directory_loader.py | 46 +++- learn2rag/importer/loaders/html_loader.py | 100 ++++++--- learn2rag/importer/loaders/process_loaders.py | 9 +- .../importer/loaders/sharepoint_loader.py | 18 +- learn2rag/importer/tests/test_loaders.py | 212 +++++++++++++++++- 5 files changed, 339 insertions(+), 46 deletions(-) diff --git a/learn2rag/importer/loaders/directory_loader.py b/learn2rag/importer/loaders/directory_loader.py index 39c3add..93c0da4 100644 --- a/learn2rag/importer/loaders/directory_loader.py +++ b/learn2rag/importer/loaders/directory_loader.py @@ -5,10 +5,10 @@ This module handles loading documents from directories. Author: Kyrill Meyer -Version: 0.0.4 +Version: 0.0.6 Institution: IFDT Creation Date: June 10, 2025 -Last Modified: March 17, 2026 +Last Modified: Mai 05, 2026 """ import hashlib import logging @@ -16,7 +16,7 @@ from datetime import datetime from typing import List, Union from ..globals import stop_loading -from langchain_community.document_loaders import DirectoryLoader +from langchain_community.document_loaders import DirectoryLoader, PyPDFDirectoryLoader from langchain_core.documents import Document # supress pdfminer-Warnings @@ -74,7 +74,6 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b "*.docx", "*.pptx", "*.xlsx", - "*.pdf", "*.txt", "*.csv", "*.html", @@ -84,6 +83,11 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b "*.epub", ] ) + pdf_loader = PyPDFDirectoryLoader( + path, + recursive=recursive, + silent_errors=silent_errors, + ) #loader = DirectoryLoader(path, show_progress=True, loader_kwargs=text_loader_kwargs, recursive=recursive, glob=["*.csv", "*.docx", "*.eml", "*.epub", "*.html", "*.json", "*.md", "*.odt", "*.pdf", "*.ppt", "*.pptx", "*.rst", "*.rtf", "*.txt", "*.tsv", "*.cls", "*.xlsx", "*.xml"]) @@ -93,7 +97,25 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b #loader = DirectoryLoader(path, show_progress=True, silent_errors=True, recursive=False) try: - loaded_documents = loader.load() + other_docs = loader.load() + # Merge PDF pages: PyPDFDirectoryLoader returns one Document per page. + # We combine all pages of the same file into one Document so that delta-import + # deduplication works on a 1:1 source→document basis (same as all other loaders). + from collections import defaultdict as _defaultdict + pdf_pages = pdf_loader.load() + pdf_by_file: dict = _defaultdict(list) + for _p in pdf_pages: + pdf_by_file[_p.metadata["source"]].append(_p) + pdf_docs = [] + for _src, _pages in pdf_by_file.items(): + _merged = Document( + page_content="\n\n".join(p.page_content for p in _pages), + metadata={**_pages[0].metadata, "total_pages": len(_pages)}, + ) + _merged.metadata.pop("page", None) + _merged.metadata.pop("page_label", None) + pdf_docs.append(_merged) + loaded_documents = other_docs + pdf_docs except Exception as e: logger.error(f"Error loading documents from directory: {e}") return [] @@ -103,9 +125,14 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b logger.info("Loading process stopped by user.") break try: - # generate a unique hash for the document content if isinstance(doc, Document): - content_hash = hashlib.sha256(doc.page_content.encode('utf-8')).hexdigest() + # Hash raw file bytes so all chunks of the same file share one stable hash. + # Required for correct deduplication in _build_existing_map / _delta_by_source. + try: + with open(doc.metadata["source"], "rb") as _f: + content_hash = hashlib.sha256(_f.read()).hexdigest() + except OSError: + content_hash = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest() doc.metadata["content_hash"] = content_hash # get file metadata @@ -154,7 +181,4 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b else: logger.warning(f"No documents found in directory: {path}") - return documents - - - + return documents \ No newline at end of file diff --git a/learn2rag/importer/loaders/html_loader.py b/learn2rag/importer/loaders/html_loader.py index 4fb9b36..6f5b434 100644 --- a/learn2rag/importer/loaders/html_loader.py +++ b/learn2rag/importer/loaders/html_loader.py @@ -6,34 +6,55 @@ Author: Kyrill Meyer Institution: IFDT -Version: 0.0.5 +Version: 0.0.7 Creation Date: July 28, 2025 -Last Modified: March 17, 2026 +Last Modified: May 05, 2026 """ import hashlib -from bs4 import BeautifulSoup +import os +import tempfile +import warnings +from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning from datetime import datetime + +warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) from typing import List, Optional, Set +from urllib.parse import urlparse, urljoin from ..globals import stop_loading from langchain_community.document_loaders import UnstructuredHTMLLoader from langchain_core.documents import Document import logging -import os import requests # initialize logger logger = logging.getLogger("Learn2RAGImporter") -def load_html_content(url: str, depth: int = 0, visited: Optional[Set[str]] = None, loader_id: str = "N/A") -> List[Document]: + +def _is_same_site(url: str, base_url: str) -> bool: + """Check if url is on the same domain and under the same path prefix as base_url.""" + parsed_url = urlparse(url) + parsed_base = urlparse(base_url) + if parsed_url.scheme not in ('http', 'https'): + return False + if parsed_url.netloc != parsed_base.netloc: + return False + base_path = parsed_base.path.rstrip('/') + return not base_path or parsed_url.path.startswith(base_path) + + +def load_html_content(url: str, depth: int = 0, visited: Optional[Set[str]] = None, loader_id: str = "N/A", _base_url: Optional[str] = None, skipped: Optional[Set[str]] = None) -> List[Document]: """ Load HTML content from a URL and optionally follow links recursively. Args: url (str): The URL of the HTML page to load. depth (int): The depth of link traversal (default is 0). + Use -1 to crawl the entire site (all links on the same domain + and under the same path as the root URL). visited (set): A set of visited URLs to avoid duplicates. + _base_url: Internal parameter to track the root URL for domain filtering. Returns: list: A list of LangChain Document objects with extracted content. @@ -41,68 +62,91 @@ def load_html_content(url: str, depth: int = 0, visited: Optional[Set[str]] = No if visited is None: visited = set() + if _base_url is None: + _base_url = url + + if skipped is None: + skipped = set() + if url in visited: logger.info(f"Skipping already visited URL: {url}") return [] visited.add(url) documents = [] - temp_file = "temp.html" - try: # Load the main page content response = requests.get(url) response.raise_for_status() # Save the HTML content to a temporary file for UnstructuredHTMLLoader - with open(temp_file, "w", encoding="utf-8") as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".html", encoding="utf-8", delete=False) as f: f.write(response.text) + temp_file = f.name # Use UnstructuredHTMLLoader to extract content loader = UnstructuredHTMLLoader(temp_file) page_documents = loader.load() + # Compute one hash for the entire page so all sub-documents share the same value. + # This ensures that get_documents_by_loader_id can safely deduplicate by source URL + # without ambiguity caused by different hashes for chunks of the same page. + page_hash = hashlib.sha256(response.text.encode('utf-8')).hexdigest() # Extract meta properties using BeautifulSoup soup = BeautifulSoup(response.text, "html.parser") meta_tags = {meta.get("name", meta.get("property", "")): meta.get("content", "") for meta in soup.find_all("meta") if meta.get("content")} - for doc in page_documents: + # Merge all extracted elements into a single Document per URL so that + # delta-import deduplication always works on a 1:1 source→document basis. + valid_docs = [d for d in page_documents if isinstance(d, Document)] + if not valid_docs: + logger.warning(f"No valid documents extracted from {url}") + else: if stop_loading: logger.info("Loading process stopped by user.") - break - # Generate a unique hash for the document content - if isinstance(doc, Document): - content_hash = hashlib.sha256(doc.page_content.encode('utf-8')).hexdigest() - doc.metadata["content_hash"] = content_hash else: - logger.warning(f"Document is not of type Document: {type(doc)}. Skipping.") - continue - doc.metadata["source"] = url - doc.metadata["process_date"] = datetime.now().strftime("%Y-%m-%d") - doc.metadata["process_time"] = datetime.now().strftime("%H:%M:%S") - doc.metadata["loader_type"] = "HTMLLoader" - doc.metadata["meta_properties"] = meta_tags - doc.metadata["loader_id"] = loader_id - documents.extend(page_documents) + merged_content = "\n\n".join(d.page_content for d in valid_docs) + merged_doc = Document( + page_content=merged_content, + metadata={ + "source": url, + "content_hash": page_hash, + "process_date": datetime.now().strftime("%Y-%m-%d"), + "process_time": datetime.now().strftime("%H:%M:%S"), + "loader_type": "HTMLLoader", + "meta_properties": meta_tags, + "loader_id": loader_id, + }, + ) + documents.append(merged_doc) logger.info(f"Loaded content from {url}") - # If depth > 0, extract links and process them recursively - if depth > 0: + # If depth > 0 or depth == -1, extract links and process them recursively + if depth > 0 or depth == -1: soup = BeautifulSoup(response.text, "html.parser") links = [a["href"] for a in soup.find_all("a", href=True)] for link in links: + if stop_loading: + break if isinstance(link, str): # Resolve relative URLs - absolute_link = requests.compat.urljoin(url, link) - documents.extend(load_html_content(absolute_link, depth=depth - 1, visited=visited)) + absolute_link = urljoin(url, link) + if depth == -1: + # Only follow links on the same domain and under the root path + if not _is_same_site(absolute_link, _base_url): + skipped.add(absolute_link) + continue + documents.extend(load_html_content(absolute_link, depth=-1, visited=visited, loader_id=loader_id, _base_url=_base_url, skipped=skipped)) + else: + documents.extend(load_html_content(absolute_link, depth=depth - 1, visited=visited, loader_id=loader_id, _base_url=_base_url, skipped=skipped)) except Exception as e: logger.error(f"Error loading content from {url}: {e}") finally: # Delete the temporary file - if os.path.exists(temp_file): + if 'temp_file' in locals() and os.path.exists(temp_file): os.remove(temp_file) logger.debug(f"Temporary file {temp_file} deleted.") diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index 7bb332f..dd82c64 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -455,10 +455,15 @@ def _delta_by_source( source: str = doc.metadata.get("source", "") new_docs_by_source.setdefault(source, []).append(doc) - # Compute combined hash per source from sorted individual content_hashes + # Compute combined hash per source from sorted, deduplicated content_hashes. + # Deduplication is required because a loader may return multiple Document objects + # for the same source (e.g. PyPDFLoader per page, UnstructuredHTMLLoader per + # section), all carrying the same file/page hash after our loader fixes. + # _build_existing_map deduplicates via set on the Qdrant side — we must match + # that behaviour here to keep both sides comparable. new_hash_by_source: Dict[str, str] = {} for source, docs in new_docs_by_source.items(): - hashes = sorted(d.metadata.get("content_hash", d.page_content) for d in docs) + hashes = sorted(set(d.metadata.get("content_hash", d.page_content) for d in docs)) combined = "".join(hashes) new_hash_by_source[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() diff --git a/learn2rag/importer/loaders/sharepoint_loader.py b/learn2rag/importer/loaders/sharepoint_loader.py index 7c521bf..f05abd8 100644 --- a/learn2rag/importer/loaders/sharepoint_loader.py +++ b/learn2rag/importer/loaders/sharepoint_loader.py @@ -33,6 +33,9 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> Parses file using the robust UnstructuredFileLoader. """ docs: List[Document] = [] + # One hash for the entire file so all chunks share the same value, + # enabling unambiguous deduplication by source URL in get_documents_by_loader_id. + file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest() # Check file extension (lowercase) suffix = file_path.suffix.lower() @@ -73,7 +76,16 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> elif suffix == ".pdf": logger.info(f"Detected PDF file: {file_path.name} - using PyPDFLoader") loader = PyPDFLoader(str(file_path)) - docs = loader.load() + _pages = loader.load() + # Merge all pages into one Document so delta-import deduplication works on + # a 1:1 source→document basis (same as all other loaders). + _merged_pdf = Document( + page_content="\n\n".join(p.page_content for p in _pages), + metadata={**(_pages[0].metadata if _pages else {}), "total_pages": len(_pages)}, + ) + _merged_pdf.metadata.pop("page", None) + _merged_pdf.metadata.pop("page_label", None) + docs = [_merged_pdf] # SPECIAL HANDLING FOR IMAGES (Skip due to broken OCR environment) elif suffix in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]: @@ -108,7 +120,7 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> "modified": str(original_item.modified), "loader": "SharePointLoader", "loader_id": loader_id, - "content_hash": hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest(), + "content_hash": file_hash, }) return docs @@ -146,7 +158,7 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> "modified": str(original_item.modified), "loader": "SharePointLoader", "loader_id": loader_id, - "content_hash": hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest(), + "content_hash": file_hash, }) return docs diff --git a/learn2rag/importer/tests/test_loaders.py b/learn2rag/importer/tests/test_loaders.py index 4cd62c3..1b61940 100644 --- a/learn2rag/importer/tests/test_loaders.py +++ b/learn2rag/importer/tests/test_loaders.py @@ -1,11 +1,14 @@ +import json import pathlib import unittest +from unittest.mock import patch, MagicMock -from ..loaders.html_loader import load_html_content +from ..loaders.directory_loader import load_from_directory +from ..loaders.html_loader import load_html_content, _is_same_site class ImporterLoadersTestCase(unittest.TestCase): def test_remote_url(self) -> None: - docs = load_html_content('https://dice-research.org') + docs = load_html_content('https://learn2rag.de') assert len(docs) == 1 doc, = docs assert 'source' in doc.metadata @@ -25,3 +28,208 @@ def test_remote_url(self) -> None: # assert doc.page_content == 'Data URI content' # TODO: actual tests + + def test_import_directory(self) -> None: + """Loads files from C:\\tmp\\importtest, prints metadata incl. content_hash, + and verifies that all chunks of the same file share one stable hash. + + Intentionally Qdrant-free: only the loader and hash consistency are tested. + Set env var SKIP_HASH_ASSERT=1 to print-only without assertions (debugging). + """ + import os + from collections import defaultdict + + path = r"C:\tmp\importtest" + if not pathlib.Path(path).is_dir(): + self.skipTest(f"Test directory not found: {path}") + + skip_assert = os.environ.get("SKIP_HASH_ASSERT", "0") == "1" + + docs = load_from_directory(path, recursive=False, loader_id="test_import") + + def _safe(text: str, limit: int = 500) -> str: + """Truncate and replace unencodable characters for safe terminal output.""" + return text[:limit].encode("cp1252", errors="replace").decode("cp1252") + + print(f"\n=== {len(docs)} document(s) loaded ===") + + # Group by source to detect per-file hash consistency + by_source: dict = defaultdict(list) + for doc in docs: + by_source[doc.metadata.get("source", "?")].append(doc) + + for i, doc in enumerate(docs, start=1): + print(f"\n--- Document {i} ---") + print(f"Metadata: {json.dumps(doc.metadata, indent=2, default=str)}") + print(f"page_content (first 500 characters):\n{_safe(doc.page_content)}") + + print(f"\n=== Hash consistency per source ===") + for source, source_docs in sorted(by_source.items()): + hashes = {d.metadata.get("content_hash", "MISSING") for d in source_docs} + status = "OK" if len(hashes) == 1 else "MISMATCH" + print(f"[{status}] {source} ({len(source_docs)} chunk(s)) hashes={hashes}") + + self.assertTrue(len(docs) > 0, "No documents found in: " + path) + + if not skip_assert: + # All chunks of the same file must share one hash + for source, source_docs in by_source.items(): + hashes = {d.metadata.get("content_hash") for d in source_docs} + self.assertEqual(len(hashes), 1, + f"Hash mismatch for '{source}': {hashes}") + else: + print("\n[SKIP_HASH_ASSERT=1] Hash assertion skipped.") + + +class HtmlLoaderLearn2RagFullCrawlTestCase(unittest.TestCase): + """Integration test: full site crawl of https://learn2rag.de with depth=-1.""" + + def test_full_site_crawl(self) -> None: + """Crawls the entire learn2rag.de domain and prints all discovered pages.""" + root_url = "https://learn2rag.de" + skipped: set = set() + docs = load_html_content(root_url, depth=-1, loader_id="learn2rag_full", skipped=skipped) + + from collections import defaultdict + by_url: dict = defaultdict(list) + for doc in docs: + by_url[doc.metadata.get("source", "?")].append(doc) + + visited_urls = set(by_url.keys()) + for i, doc in enumerate(docs, start=1): + print(f"\n--- Document {i}: {doc.metadata.get('source')} ---") + print(f"page_content (first 300 characters):\n{doc.page_content[:300]}") + + print(f"\n{'=' * 60}") + print(f"SUMMARY") + print(f"{'=' * 60}") + print(f" Integrated (unique pages loaded): {len(visited_urls)}") + print(f" Skipped (off-site links): {len(skipped)}") + print(f" Total documents (incl. duplicates): {len(docs)}") + print(f"\n Documents per URL:") + for url in sorted(visited_urls): + count = len(by_url[url]) + print(f" [{count} doc(s)] {url}") + print(f"\n Skipped URLs (sample, max 20):") + for url in sorted(skipped)[:20]: + print(f" [--] {url}") + if len(skipped) > 20: + print(f" ... and {len(skipped) - 20} more") + print(f"{'=' * 60}") + + # At least the root page must have been loaded + self.assertTrue(len(docs) >= 1, "No documents found") + # All documents must stay on the learn2rag.de domain + for url in visited_urls: + self.assertIn("learn2rag.de", url, f"External URL found: {url}") + # Metadata must be complete + for doc in docs: + self.assertIn("loader_id", doc.metadata) + self.assertIn("content_hash", doc.metadata) + self.assertEqual(doc.metadata["loader_id"], "learn2rag_full") + + +class IsSameSiteTestCase(unittest.TestCase): + """Unit tests for _is_same_site — no network access.""" + + def test_same_domain_no_path(self) -> None: + self.assertTrue(_is_same_site("https://example.com/page", "https://example.com")) + + def test_same_domain_with_path_prefix(self) -> None: + self.assertTrue(_is_same_site("https://example.com/docs/guide", "https://example.com/docs/")) + + def test_different_path_prefix(self) -> None: + self.assertFalse(_is_same_site("https://example.com/blog/post", "https://example.com/docs/")) + + def test_different_domain(self) -> None: + self.assertFalse(_is_same_site("https://other.com/docs/page", "https://example.com/docs/")) + + def test_non_http_scheme(self) -> None: + self.assertFalse(_is_same_site("mailto:info@example.com", "https://example.com")) + + def test_anchor_link(self) -> None: + # Fragment-only links resolve to the same page and remain on-site + self.assertTrue(_is_same_site("https://example.com/docs/page#section", "https://example.com/docs/")) + + +def _make_response(text: str, status_code: int = 200) -> MagicMock: + """Helper: creates a fake requests.Response.""" + resp = MagicMock() + resp.status_code = status_code + resp.text = text + resp.raise_for_status = MagicMock() + return resp + + +ROOT_HTML = """ +Root +

Root page

+ Page 1 + Page 2 + Blog (other subtree) + External + +""" + +PAGE1_HTML = """ +Page 1 +

Page 1 content

+ Page 2 (already visited) + +""" + +PAGE2_HTML = """ +Page 2 +

Page 2 content

+ +""" + + +class HtmlLoaderDepthMinusOneTestCase(unittest.TestCase): + """Tests for depth=-1 (full site crawl) using mocked HTTP requests.""" + + def _fake_get(self, url: str, **kwargs) -> MagicMock: + pages = { + "https://example.com/docs/": ROOT_HTML, + "https://example.com/docs/page1": PAGE1_HTML, + "https://example.com/docs/page2": PAGE2_HTML, + } + return _make_response(pages.get(url, "Not found")) + + @patch("learn2rag.importer.loaders.html_loader.requests.get") + def test_crawls_same_subtree(self, mock_get: MagicMock) -> None: + """depth=-1 should only visit URLs under /docs/, not /blog/ or external.com.""" + mock_get.side_effect = self._fake_get + docs = load_html_content("https://example.com/docs/", depth=-1, loader_id="test") + + visited_urls = {doc.metadata["source"] for doc in docs} + print(f"\nVisited URLs: {visited_urls}") + + # Expected: root + page1 + page2 + self.assertIn("https://example.com/docs/", visited_urls) + self.assertIn("https://example.com/docs/page1", visited_urls) + self.assertIn("https://example.com/docs/page2", visited_urls) + + # Not expected: different subtree and external domain + self.assertNotIn("https://example.com/blog/post", visited_urls) + self.assertNotIn("https://external.com/", visited_urls) + + @patch("learn2rag.importer.loaders.html_loader.requests.get") + def test_no_duplicate_visits(self, mock_get: MagicMock) -> None: + """Each URL is loaded only once (page2 is linked from both root and page1).""" + mock_get.side_effect = self._fake_get + docs = load_html_content("https://example.com/docs/", depth=-1, loader_id="test") + + sources = [doc.metadata["source"] for doc in docs] + self.assertEqual(len(sources), len(set(sources)), "Duplicate URLs found") + + @patch("learn2rag.importer.loaders.html_loader.requests.get") + def test_metadata_set_correctly(self, mock_get: MagicMock) -> None: + """All documents have loader_id, content_hash and loader_type set.""" + mock_get.side_effect = self._fake_get + docs = load_html_content("https://example.com/docs/", depth=-1, loader_id="test_meta") + + for doc in docs: + self.assertEqual(doc.metadata.get("loader_id"), "test_meta") + self.assertIn("content_hash", doc.metadata) + self.assertEqual(doc.metadata.get("loader_type"), "HTMLLoader") \ No newline at end of file From 2ef59c289b7d16de5c30831c8479b0b33ae4ad40 Mon Sep 17 00:00:00 2001 From: Carolin Walter Date: Wed, 6 May 2026 13:11:00 +0000 Subject: [PATCH 06/34] simplify loading pdf in one document Co-authored-by: Copilot --- .../importer/loaders/directory_loader.py | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/learn2rag/importer/loaders/directory_loader.py b/learn2rag/importer/loaders/directory_loader.py index 93c0da4..8c1d2f1 100644 --- a/learn2rag/importer/loaders/directory_loader.py +++ b/learn2rag/importer/loaders/directory_loader.py @@ -16,7 +16,7 @@ from datetime import datetime from typing import List, Union from ..globals import stop_loading -from langchain_community.document_loaders import DirectoryLoader, PyPDFDirectoryLoader +from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader from langchain_core.documents import Document # supress pdfminer-Warnings @@ -63,7 +63,7 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b recursive = recursive.lower() == "true" - text_loader_kwargs = {"autodetect_encoding": True, "detect_language_per_element": False} + text_loader_kwargs = {"autodetect_encoding": True, "detect_language_per_element": False, "mode": "single"} loader = DirectoryLoader( path, show_progress=True, @@ -83,14 +83,16 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b "*.epub", ] ) - pdf_loader = PyPDFDirectoryLoader( + # use pypdf instead of unstructured[pdf] for better performance and stability, especially with large PDFs + pdf_loader = DirectoryLoader( path, + glob="*.pdf", + loader_cls=PyPDFLoader, + loader_kwargs={"mode": "single"}, recursive=recursive, silent_errors=silent_errors, ) - #loader = DirectoryLoader(path, show_progress=True, loader_kwargs=text_loader_kwargs, recursive=recursive, glob=["*.csv", "*.docx", "*.eml", "*.epub", "*.html", "*.json", "*.md", "*.odt", "*.pdf", "*.ppt", "*.pptx", "*.rst", "*.rtf", "*.txt", "*.tsv", "*.cls", "*.xlsx", "*.xml"]) - #external dependencies # doc - requires libreoffice # epub - requires pandoc @@ -98,23 +100,7 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b #loader = DirectoryLoader(path, show_progress=True, silent_errors=True, recursive=False) try: other_docs = loader.load() - # Merge PDF pages: PyPDFDirectoryLoader returns one Document per page. - # We combine all pages of the same file into one Document so that delta-import - # deduplication works on a 1:1 source→document basis (same as all other loaders). - from collections import defaultdict as _defaultdict - pdf_pages = pdf_loader.load() - pdf_by_file: dict = _defaultdict(list) - for _p in pdf_pages: - pdf_by_file[_p.metadata["source"]].append(_p) - pdf_docs = [] - for _src, _pages in pdf_by_file.items(): - _merged = Document( - page_content="\n\n".join(p.page_content for p in _pages), - metadata={**_pages[0].metadata, "total_pages": len(_pages)}, - ) - _merged.metadata.pop("page", None) - _merged.metadata.pop("page_label", None) - pdf_docs.append(_merged) + pdf_docs = pdf_loader.load() loaded_documents = other_docs + pdf_docs except Exception as e: logger.error(f"Error loading documents from directory: {e}") From 505b272c2c6461d544f3f2da14157c69aeac3660 Mon Sep 17 00:00:00 2001 From: Carolin Walter Date: Thu, 7 May 2026 13:14:05 +0000 Subject: [PATCH 07/34] adapt get_documents() to the needs of process_loaders Co-authored-by: Copilot --- learn2rag/importer/loaders/process_loaders.py | 40 ++----------------- learn2rag/pipeline/store.py | 19 ++++++--- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index dd82c64..0bbcc79 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -212,9 +212,8 @@ def process_delta_imports( import_start = datetime.now(timezone.utc) import_state.record_import_start(loader_id, import_start) - # Retrieve existing Qdrant documents for this loader and build {source: combined_hash} map - payloads: List[Dict[str, Any]] = get_documents(loader_id, user_config, opt_config) or [] - existing_map: Dict[str, str] = _build_existing_map(payloads) + # Retrieve existing Qdrant documents for this loader as {source: content_hash} map + existing_map: Dict[str, str] = get_documents(loader_id, user_config, opt_config) is_initial = len(existing_map) == 0 logger.info( @@ -392,36 +391,6 @@ def process_delta_imports( logger.error(f"process_delta_imports: error processing loader '{loader_id}': {e}", exc_info=True) -def _build_existing_map(payloads: List[Dict[str, Any]]) -> Dict[str, str]: - """ - Build a ``{source: combined_content_hash}`` mapping from Qdrant payloads. - - Groups payloads by ``source`` field and computes a combined hash per source - from the sorted individual ``content_hash`` values. This makes the comparison - stable regardless of chunk order. - - Args: - payloads (List[Dict[str, Any]]): Raw Qdrant payloads as returned by - ``get_documents()``. - - Returns: - Dict[str, str]: Mapping of ``{source: combined_hash}``. - """ - # Use a set to deduplicate content_hash values: all chunks of one document - # share the same content_hash, so duplicates must be collapsed before combining. - chunks_by_source: Dict[str, set] = {} - for payload in payloads: - source = payload.get("source", "") - content_hash = payload.get("content_hash", "") - if source: - chunks_by_source.setdefault(source, set()).add(content_hash) - result: Dict[str, str] = {} - for source, hashes in chunks_by_source.items(): - combined = "".join(sorted(hashes)) - result[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() - return result - - def _delta_by_source( all_docs: List[Document], existing_map: Dict[str, str], @@ -442,8 +411,7 @@ def _delta_by_source( Args: all_docs (List[Document]): All documents returned by the loader for this run. - existing_map (Dict[str, str]): Mapping of ``{source: combined_hash}`` as - built by ``_build_existing_map()``. + existing_map (Dict[str, str]): Mapping of ``{source: content_hash}``. loader_id (str): Unique loader identifier. user_config (Dict[str, Any]): User configuration dict (must contain ``collection_name``). @@ -454,7 +422,7 @@ def _delta_by_source( for doc in all_docs: source: str = doc.metadata.get("source", "") new_docs_by_source.setdefault(source, []).append(doc) - + # TODO: make sure that only one document is returned per source from the loader (what about docx, pptx?) with this combinded hashes are not required # Compute combined hash per source from sorted, deduplicated content_hashes. # Deduplication is required because a loader may return multiple Document objects # for the same source (e.g. PyPDFLoader per page, UnstructuredHTMLLoader per diff --git a/learn2rag/pipeline/store.py b/learn2rag/pipeline/store.py index f14c926..4fef469 100644 --- a/learn2rag/pipeline/store.py +++ b/learn2rag/pipeline/store.py @@ -56,9 +56,10 @@ def delete_documents(loader_id: str, paths: list[str], user_config: dict[str, An ), ) -def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[str, Any]) -> list[dict[str, Any]]|None: - """Retrieve documents from the vector store based on loader_id.""" +def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[str, Any]) -> dict[str, str]: + """Retrieve documents from the vector store and return a {source: content_hash} mapping.""" qdrant = Qdrant(user_config["collection_name"], opt_config) + path_hash_dict: dict[str, str] = {} if qdrant.client.collection_exists(user_config["collection_name"]): logging.info('Scrolling through collection to retrieve documents with loader_id: %s', loader_id) filter = Filter( @@ -78,7 +79,7 @@ def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[ scroll_filter=filter, limit=100, offset=offset, - with_payload=True, + with_payload=["source", "content_hash"], with_vectors=False, ) @@ -87,8 +88,16 @@ def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[ if offset is None: break - return [point.payload for point in points if point.payload is not None] - return None + + for point in points: + if point.payload: + source = point.payload.get("source", "") + content_hash = point.payload.get("content_hash", "") + if source and source not in path_hash_dict: + path_hash_dict[source] = content_hash + + return path_hash_dict + def update_documents(loader_id: str, documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: From afebe20f2a9258d79cb79829c923742bae6ae7fd Mon Sep 17 00:00:00 2001 From: kymeyer Date: Mon, 11 May 2026 08:03:47 +0200 Subject: [PATCH 08/34] remove combined-hash --- learn2rag/importer/loaders/process_loaders.py | 22 ++-- learn2rag/importer/tests/test_loaders.py | 111 +++++++++++------- 2 files changed, 76 insertions(+), 57 deletions(-) diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index 0bbcc79..1c48bce 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -417,23 +417,19 @@ def _delta_by_source( ``collection_name``). opt_config (Dict[str, Any]): Optimisation configuration dict. """ - # Group freshly loaded documents by source (1 source = N chunks) + # Group freshly loaded documents by source (1 source = 1 Document after loader merging) new_docs_by_source: Dict[str, List[Document]] = {} for doc in all_docs: source: str = doc.metadata.get("source", "") new_docs_by_source.setdefault(source, []).append(doc) - # TODO: make sure that only one document is returned per source from the loader (what about docx, pptx?) with this combinded hashes are not required - # Compute combined hash per source from sorted, deduplicated content_hashes. - # Deduplication is required because a loader may return multiple Document objects - # for the same source (e.g. PyPDFLoader per page, UnstructuredHTMLLoader per - # section), all carrying the same file/page hash after our loader fixes. - # _build_existing_map deduplicates via set on the Qdrant side — we must match - # that behaviour here to keep both sides comparable. - new_hash_by_source: Dict[str, str] = {} - for source, docs in new_docs_by_source.items(): - hashes = sorted(set(d.metadata.get("content_hash", d.page_content) for d in docs)) - combined = "".join(hashes) - new_hash_by_source[source] = hashlib.sha256(combined.encode("utf-8")).hexdigest() + + # All loaders guarantee exactly one Document per source (PDF pages merged, + # HTML elements merged), so content_hash on docs[0] is directly comparable + # to the value returned by get_documents() from Qdrant. + new_hash_by_source: Dict[str, str] = { + source: docs[0].metadata.get("content_hash", "") + for source, docs in new_docs_by_source.items() + } # Bulk-delete sources that are no longer present in the fresh load deleted_sources: List[str] = [s for s in existing_map if s not in new_docs_by_source] diff --git a/learn2rag/importer/tests/test_loaders.py b/learn2rag/importer/tests/test_loaders.py index 1b61940..f45f333 100644 --- a/learn2rag/importer/tests/test_loaders.py +++ b/learn2rag/importer/tests/test_loaders.py @@ -1,97 +1,120 @@ import json +import os import pathlib +import shutil +import sys +import tempfile import unittest +from collections import defaultdict +from typing import ClassVar, DefaultDict, List, Optional, Set from unittest.mock import patch, MagicMock +from langchain_core.documents import Document from ..loaders.directory_loader import load_from_directory from ..loaders.html_loader import load_html_content, _is_same_site +# Set RUN_INTEGRATION_TESTS=1 to run tests that require network access. +_RUN_INTEGRATION: bool = os.environ.get("RUN_INTEGRATION_TESTS", "0") == "1" + + class ImporterLoadersTestCase(unittest.TestCase): + """Tests for directory and HTML loaders. Runs fully offline without Qdrant.""" + + test_path: ClassVar[str] + _temp_dir: ClassVar[Optional[str]] + + @classmethod + def setUpClass(cls) -> None: + env_path = os.environ.get("TEST_IMPORT_PATH", "") + if env_path and pathlib.Path(env_path).is_dir(): + cls.test_path = env_path + cls._temp_dir = None + else: + cls._temp_dir = tempfile.mkdtemp(prefix="learn2rag_test_") + cls.test_path = cls._temp_dir + (pathlib.Path(cls._temp_dir) / "sample.txt").write_text( + "This is a test document.\nIt has multiple lines of content.\nLine three.", + encoding="utf-8", + ) + + @classmethod + def tearDownClass(cls) -> None: + if cls._temp_dir: + shutil.rmtree(cls._temp_dir, ignore_errors=True) + + @unittest.skipUnless(_RUN_INTEGRATION, "Set RUN_INTEGRATION_TESTS=1 to run") def test_remote_url(self) -> None: - docs = load_html_content('https://learn2rag.de') + docs = load_html_content("https://learn2rag.de") assert len(docs) == 1 doc, = docs - assert 'source' in doc.metadata - assert 'The DICE group at Paderborn University' in doc.page_content - - # def test_local_file(self): - # path = pathlib.Path(__file__).parent.resolve() / 'html' - # docs = load_html_content((path / 'local_file.html').as_uri()) - # assert len(docs) == 1 - # doc, = docs - # assert doc.page_content == 'Local file content' - - # def test_data_uri(self): - # docs = load_html_content('data:text/html;charset=utf-8,%3Cbody%3E%3Cp%3EData%20URI%20content%3C%2Fp%3E%3C%2Fbody%3E') - # assert len(docs) == 1 - # doc, = docs - # assert doc.page_content == 'Data URI content' - - # TODO: actual tests + assert "source" in doc.metadata + assert "The DICE group at Paderborn University" in doc.page_content def test_import_directory(self) -> None: - """Loads files from C:\\tmp\\importtest, prints metadata incl. content_hash, - and verifies that all chunks of the same file share one stable hash. + """Loads files from test_path, prints metadata incl. content_hash, and + verifies that each source yields exactly one Document with a stable hash. Intentionally Qdrant-free: only the loader and hash consistency are tested. Set env var SKIP_HASH_ASSERT=1 to print-only without assertions (debugging). """ - import os - from collections import defaultdict - - path = r"C:\tmp\importtest" - if not pathlib.Path(path).is_dir(): - self.skipTest(f"Test directory not found: {path}") - skip_assert = os.environ.get("SKIP_HASH_ASSERT", "0") == "1" - docs = load_from_directory(path, recursive=False, loader_id="test_import") + docs: List[Document] = load_from_directory( + self.test_path, recursive=False, loader_id="test_import" + ) def _safe(text: str, limit: int = 500) -> str: """Truncate and replace unencodable characters for safe terminal output.""" - return text[:limit].encode("cp1252", errors="replace").decode("cp1252") + encoding = sys.stdout.encoding or "utf-8" + return text[:limit].encode(encoding, errors="replace").decode(encoding) - print(f"\n=== {len(docs)} document(s) loaded ===") + print(f"\n=== {len(docs)} document(s) loaded from '{self.test_path}' ===") - # Group by source to detect per-file hash consistency - by_source: dict = defaultdict(list) + by_source: DefaultDict[str, List[Document]] = defaultdict(list) for doc in docs: by_source[doc.metadata.get("source", "?")].append(doc) for i, doc in enumerate(docs, start=1): print(f"\n--- Document {i} ---") print(f"Metadata: {json.dumps(doc.metadata, indent=2, default=str)}") - print(f"page_content (first 500 characters):\n{_safe(doc.page_content)}") + print(f"page_content (first 500 chars):\n{_safe(doc.page_content)}") - print(f"\n=== Hash consistency per source ===") + print("\n=== Hash consistency per source ===") for source, source_docs in sorted(by_source.items()): - hashes = {d.metadata.get("content_hash", "MISSING") for d in source_docs} + hashes: Set[Optional[str]] = { + d.metadata.get("content_hash", "MISSING") for d in source_docs + } status = "OK" if len(hashes) == 1 else "MISMATCH" - print(f"[{status}] {source} ({len(source_docs)} chunk(s)) hashes={hashes}") + print(f"[{status}] {source} ({len(source_docs)} doc(s)) hashes={hashes}") - self.assertTrue(len(docs) > 0, "No documents found in: " + path) + self.assertGreater(len(docs), 0, f"No documents found in: {self.test_path}") if not skip_assert: - # All chunks of the same file must share one hash for source, source_docs in by_source.items(): - hashes = {d.metadata.get("content_hash") for d in source_docs} - self.assertEqual(len(hashes), 1, - f"Hash mismatch for '{source}': {hashes}") + hashes2: Set[Optional[str]] = {d.metadata.get("content_hash") for d in source_docs} + self.assertEqual( + len(hashes2), 1, f"Hash mismatch for '{source}': {hashes2}" + ) + self.assertEqual( + len(source_docs), + 1, + f"Expected 1 Document per source, got {len(source_docs)} for '{source}'", + ) else: print("\n[SKIP_HASH_ASSERT=1] Hash assertion skipped.") +@unittest.skipUnless(_RUN_INTEGRATION, "Set RUN_INTEGRATION_TESTS=1 to run") class HtmlLoaderLearn2RagFullCrawlTestCase(unittest.TestCase): """Integration test: full site crawl of https://learn2rag.de with depth=-1.""" def test_full_site_crawl(self) -> None: """Crawls the entire learn2rag.de domain and prints all discovered pages.""" root_url = "https://learn2rag.de" - skipped: set = set() + skipped: Set[str] = set() docs = load_html_content(root_url, depth=-1, loader_id="learn2rag_full", skipped=skipped) - from collections import defaultdict - by_url: dict = defaultdict(list) + by_url: DefaultDict[str, List[Document]] = defaultdict(list) for doc in docs: by_url[doc.metadata.get("source", "?")].append(doc) From 3be4a20b1f40eb4c5f8dca78f1838b133321a90a Mon Sep 17 00:00:00 2001 From: kymeyer Date: Mon, 11 May 2026 08:13:24 +0200 Subject: [PATCH 09/34] type-safety improvements / comments --- learn2rag/importer/loaders/directory_loader.py | 6 +++--- learn2rag/importer/loaders/process_loaders.py | 7 ++++--- learn2rag/importer/tests/test_loaders.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/learn2rag/importer/loaders/directory_loader.py b/learn2rag/importer/loaders/directory_loader.py index 8c1d2f1..aeb600b 100644 --- a/learn2rag/importer/loaders/directory_loader.py +++ b/learn2rag/importer/loaders/directory_loader.py @@ -87,7 +87,7 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b pdf_loader = DirectoryLoader( path, glob="*.pdf", - loader_cls=PyPDFLoader, + loader_cls=PyPDFLoader, # type: ignore[arg-type] loader_kwargs={"mode": "single"}, recursive=recursive, silent_errors=silent_errors, @@ -112,8 +112,8 @@ def load_from_directory(path: str, recursive: Union[bool, str], silent_errors: b break try: if isinstance(doc, Document): - # Hash raw file bytes so all chunks of the same file share one stable hash. - # Required for correct deduplication in _build_existing_map / _delta_by_source. + # Hash raw file bytes so the Document carries a stable, source-level hash + # directly comparable to the value stored in Qdrant by get_documents(). try: with open(doc.metadata["source"], "rb") as _f: content_hash = hashlib.sha256(_f.read()).hexdigest() diff --git a/learn2rag/importer/loaders/process_loaders.py b/learn2rag/importer/loaders/process_loaders.py index 1c48bce..025585e 100644 --- a/learn2rag/importer/loaders/process_loaders.py +++ b/learn2rag/importer/loaders/process_loaders.py @@ -401,11 +401,12 @@ def _delta_by_source( """ Hash-based delta import for normal loaders (DirectoryLoader, HTMLLoader, CSVLoader). - Groups freshly loaded documents by ``source``, computes a combined content hash - per source, and then: + Each loader guarantees exactly one Document per source (PDF pages merged, HTML + elements merged), so ``content_hash`` on that Document is the raw-file hash and + directly comparable to the value stored in Qdrant by ``get_documents()``. - Deletes Qdrant chunks (bulk) for sources that no longer exist in the new load. - - Calls ``update_documents`` for sources whose combined hash has changed or that + - Calls ``update_documents`` for sources whose content_hash has changed or that are entirely new (update_documents handles delete-then-reindex internally). - Leaves unchanged sources untouched. diff --git a/learn2rag/importer/tests/test_loaders.py b/learn2rag/importer/tests/test_loaders.py index f45f333..63408b2 100644 --- a/learn2rag/importer/tests/test_loaders.py +++ b/learn2rag/importer/tests/test_loaders.py @@ -6,7 +6,7 @@ import tempfile import unittest from collections import defaultdict -from typing import ClassVar, DefaultDict, List, Optional, Set +from typing import Any, ClassVar, DefaultDict, List, Optional, Set from unittest.mock import patch, MagicMock from langchain_core.documents import Document @@ -211,7 +211,7 @@ def _make_response(text: str, status_code: int = 200) -> MagicMock: class HtmlLoaderDepthMinusOneTestCase(unittest.TestCase): """Tests for depth=-1 (full site crawl) using mocked HTTP requests.""" - def _fake_get(self, url: str, **kwargs) -> MagicMock: + def _fake_get(self, url: str, **kwargs: Any) -> MagicMock: pages = { "https://example.com/docs/": ROOT_HTML, "https://example.com/docs/page1": PAGE1_HTML, From 7a60321116d5bf272061ca37fb191f3526db3f66 Mon Sep 17 00:00:00 2001 From: Carolin Walter Date: Wed, 13 May 2026 13:17:45 +0000 Subject: [PATCH 10/34] changed naming: path to source --- learn2rag/pipeline/store.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/learn2rag/pipeline/store.py b/learn2rag/pipeline/store.py index 4fef469..2ec3be7 100644 --- a/learn2rag/pipeline/store.py +++ b/learn2rag/pipeline/store.py @@ -31,13 +31,13 @@ def delete_collection(loader_id: str|None, user_config: dict[str, Any], opt_conf ), ) -def delete_documents(loader_id: str, paths: list[str], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: - """Delete documents from the vector store based on loader_id and paths.""" +def delete_documents(loader_id: str, docs: list[str], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """Delete documents from the vector store based on loader_id and their source. A source is the path to and the identification of one document.""" qdrant = Qdrant(user_config["collection_name"], opt_config) if qdrant.client.collection_exists(user_config["collection_name"]): - logging.info('Deleting documents with loader_id: %s and paths: %s', loader_id, paths) + logging.info('Deleting documents with loader_id: %s and paths: %s', loader_id, docs) # Delete points with the specified loader_id and paths - for path in paths: + for path in docs: qdrant.client.delete( collection_name=user_config["collection_name"], points_selector=FilterSelector( @@ -101,8 +101,9 @@ def get_documents(loader_id: str, user_config: dict[str, Any], opt_config: dict[ def update_documents(loader_id: str, documents: list[Document], user_config: dict[str, Any], opt_config: dict[str, Any]) -> None: + """Update documents in the vector store. This is done by deleting all chunks of the existing document based on source and loader_id, and then re-indexing the new document.""" qdrant = Qdrant(user_config["collection_name"], opt_config) if qdrant.client.collection_exists(user_config["collection_name"]): logging.info('Updating documents with loader_id: %s', loader_id) - delete_documents(loader_id, paths=[doc.metadata["source"] for doc in documents], user_config=user_config, opt_config=opt_config) + delete_documents(loader_id, docs=[doc.metadata["source"] for doc in documents], user_config=user_config, opt_config=opt_config) index(documents, user_config, opt_config) \ No newline at end of file From 05f60815f939380911f40a1c2539d5959bbdcbaa Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 08:25:05 +0200 Subject: [PATCH 11/34] Fix running the import from UI --- services/start-import | 1 - services/start-import.linux | 1 - services/start-import.windows.bat | 1 - 3 files changed, 3 deletions(-) diff --git a/services/start-import b/services/start-import index fcd2b27..7cbbcce 100755 --- a/services/start-import +++ b/services/start-import @@ -1,4 +1,3 @@ #!/bin/sh set -eu "$LEARN2RAG_PATH"/configurator learn2rag.importer --config "$IMPORTER_CONFIG" --logging-config "$STORAGE_PATH"/logging_config.yml -"$LEARN2RAG_PATH"/configurator learn2rag.pipeline.ingestion --logging-config "$STORAGE_PATH"/logging_config.yml diff --git a/services/start-import.linux b/services/start-import.linux index fcd2b27..7cbbcce 100755 --- a/services/start-import.linux +++ b/services/start-import.linux @@ -1,4 +1,3 @@ #!/bin/sh set -eu "$LEARN2RAG_PATH"/configurator learn2rag.importer --config "$IMPORTER_CONFIG" --logging-config "$STORAGE_PATH"/logging_config.yml -"$LEARN2RAG_PATH"/configurator learn2rag.pipeline.ingestion --logging-config "$STORAGE_PATH"/logging_config.yml diff --git a/services/start-import.windows.bat b/services/start-import.windows.bat index 9bcaf14..0a8861e 100644 --- a/services/start-import.windows.bat +++ b/services/start-import.windows.bat @@ -1,2 +1 @@ "%LEARN2RAG_PATH%/configurator.exe" learn2rag.importer --config "%IMPORTER_CONFIG%" --logging-config "%STORAGE_PATH%/logging_config.yml" -"%LEARN2RAG_PATH%/configurator.exe" learn2rag.pipeline.ingestion --logging-config "%STORAGE_PATH%/logging_config.yml" From 90f7dbea3d36b63016f439f0f93767bd49ad4c5a Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 09:39:42 +0200 Subject: [PATCH 12/34] Refactor calling of the modules from the main entrypoint --- learn2rag/__main__.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/learn2rag/__main__.py b/learn2rag/__main__.py index 21120a5..60267ab 100644 --- a/learn2rag/__main__.py +++ b/learn2rag/__main__.py @@ -58,12 +58,21 @@ def configure_logging(config_path: pathlib.Path, debug: bool) -> None: logging.debug('Learn2RAG launcher starting: %s, %s', args, rest) module = importlib.import_module(args.module) # TODO + module_args = tuple() + module_kwargs = {} if args.module == 'learn2rag.ollama_tool': # FIXME default config values - module.main(rest, config=config.get('OLLAMA', {'port': 11434})) + module_args = ( + rest, + ) + module_kwargs = {'config': config.get('OLLAMA', {'port': 11434})} elif args.module == 'learn2rag.importer': - module.main(module.ImporterArgumentParser().parse_args(rest)) + module_args = ( + module.ImporterArgumentParser().parse_args(rest), + ) elif args.module == 'learn2rag.ui': - module.main(config) - else: - module.main() + module_args = ( + config, + ) + + module.main(*module_args, **module_kwargs) From 3a2bfd1a3b34a6a4e2e1aff48691568fafca974a Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 10:01:57 +0200 Subject: [PATCH 13/34] Add repeatedly scheduled import running with `apscheduler` --- learn2rag/__main__.py | 20 +- .../compose/pipelines/continuous.yml | 177 ++++++++++++++++++ pyproject.toml | 1 + uv.lock | 14 ++ 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 learn2rag/ui/templates/compose/pipelines/continuous.yml diff --git a/learn2rag/__main__.py b/learn2rag/__main__.py index 60267ab..93afec2 100644 --- a/learn2rag/__main__.py +++ b/learn2rag/__main__.py @@ -6,10 +6,14 @@ import os import pathlib import sys +from datetime import datetime, timedelta from types import TracebackType from typing import Unpack import yaml +from apscheduler.schedulers.blocking import BlockingScheduler +from apscheduler.triggers.interval import IntervalTrigger +from pydantic import TypeAdapter class LauncherArgumentParser(argparse.ArgumentParser): @@ -17,6 +21,7 @@ def __init__(self) -> None: super().__init__() self.add_argument('module', type=str, nargs='?', default='learn2rag.ui') self.add_argument('--logging-config', type=pathlib.Path) + self.add_argument('--schedule-interval', type=TypeAdapter(timedelta).validate_python) def excepthook(*exc_info: Unpack[tuple[type[BaseException], BaseException, TracebackType | None]]) -> None: @@ -75,4 +80,17 @@ def configure_logging(config_path: pathlib.Path, debug: bool) -> None: config, ) - module.main(*module_args, **module_kwargs) + if args.schedule_interval: + scheduler = BlockingScheduler() + trigger = IntervalTrigger(seconds=args.schedule_interval.total_seconds()) + scheduler.add_job( + module.main, + trigger, + next_run_time=datetime.utcnow(), + max_instances=1, + args=module_args, + kwargs=module_kwargs, + ) + scheduler.start() + else: + module.main(*module_args, **module_kwargs) diff --git a/learn2rag/ui/templates/compose/pipelines/continuous.yml b/learn2rag/ui/templates/compose/pipelines/continuous.yml new file mode 100644 index 0000000..bbc1727 --- /dev/null +++ b/learn2rag/ui/templates/compose/pipelines/continuous.yml @@ -0,0 +1,177 @@ +name: continuous +label: Continuous +ports: + # TODO: labels in the interface currently assume a specific port order + - ui + - qdrant_http + - pipeline + - open_webui_pipelines +ui_url: '{{learn2rag_scheme}}://{{learn2rag_hostname}}:{{ports.ui}}/' +files: + - path: '{{storage_path}}/qdrant_config.yml' + content: | + log_level: INFO + service: + api_key: '{{qdrant_api_key}}' + grpc_port: null + http_port: {{ports.qdrant_http}} + host: '127.0.0.1' + telemetry_disabled: true + + - path: '{{storage_path}}/importer_config.json' + content: '{{import_config | tojson}}' + + - path: '{{storage_path}}/basic_user_config.json' + content: | + { + "collection_name": "learn2rag", + "imported_documents_file_path": "loaded_documents.json", + "llm": "{{language_model.model}}" + } + + - path: '{{storage_path}}/logging_config.yml' + content: | + version: 1 + formatters: + simple: + format: "%(asctime)s %(levelname)-8s %(name)s %(message)s" + colored: + class: colorlog.ColoredFormatter + format: "%(log_color)s%(asctime)s %(levelname)-8s %(name)s %(message)s" + handlers: + display: + class: logging.StreamHandler + level: INFO + formatter: colored + stream: ext://sys.stderr + errors_file: + class: logging.FileHandler + level: ERROR + formatter: simple + filename: '{{storage_path}}/logs/error.log' + encoding: utf-8 + {% if debug_logging %} + debug_file: + class: logging.FileHandler + level: DEBUG + formatter: simple + filename: '{{storage_path}}/logs/debug.log' + encoding: utf-8 + {% endif %} + root: + level: DEBUG + handlers: + - display + - errors_file + {% if debug_logging %} + - debug_file + {% endif %} + + - path: '{{storage_path}}/logs/.keep' + content: '' + +services: + open-webui: + working_dir: '{{storage_path}}' + command: + - '{{learn2rag_path}}/services/start-open-webui{% if is_windows %}.exe{% endif %}' + - 'serve' + - '--host' + - '0.0.0.0' + - '--port' + - '{{ports.ui}}' + environment: + LEARN2RAG_PATH: '{{learn2rag_path}}' + DATA_DIR: '{{storage_path}}/open-webui-data' + DEFAULT_LOCALE: 'de' + ENABLE_OPENAI_API: 'True' + OPENAI_API_BASE_URL: http://127.0.0.1:{{ports.pipeline}} + OPENAI_API_KEY: '0p3n-w3bu!' + ENABLE_OLLAMA_API: 'False' + ENABLE_PERSISTENT_CONFIG: 'False' + ENABLE_TITLE_GENERATION: 'False' + GLOBAL_LOG_LEVEL: 'WARNING' + OFFLINE_MODE: 'True' + USER_PERMISSIONS_CHAT_FILE_UPLOAD: 'False' + USER_PERMISSIONS_CHAT_TEMPORARY_ENFORCED: 'True' + USER_PERMISSIONS_FEATURES_CODE_INTERPRETER: 'False' + USER_PERMISSIONS_FEATURES_IMAGE_GENERATION: 'False' + USER_PERMISSIONS_FEATURES_WEB_SEARCH: 'False' + #!!! {% if config.get("SIMPLE_AUTH") %} + ENABLE_SIGNUP: 'False' + WEBUI_AUTH: 'True' + WEBUI_ADMIN_EMAIL: '{{config["SIMPLE_AUTH"].get("username", "")}}' + WEBUI_ADMIN_NAME: '{{config["SIMPLE_AUTH"].get("username", "")}}' + WEBUI_ADMIN_PASSWORD: '{{config["SIMPLE_AUTH"].get("password", "")}}' + #!!! {% else %} + # https://docs.openwebui.com/getting-started/env-configuration/#webui_auth + # "turning off authentication is only possible for fresh installations without any existing users" + ENABLE_SIGNUP: 'True' + WEBUI_AUTH: 'False' + #!!! {% endif %} + WEBUI_NAME: 'Learn2RAG' + #!!! {% if config.get("TLS") %} + UVICORN_SSL_CERTFILE: '{{ config["TLS"]["CERTFILE"] }}' + UVICORN_SSL_KEYFILE: '{{ config["TLS"]["KEYFILE"] }}' + #!!! {% endif %} + healthcheck: + # TODO: We only support ['CMD', 'curl', '-f', ...] + test: ['CMD', 'curl', '-f', '{{learn2rag_scheme}}://localhost:{{ports.ui}}/health'] + qdrant: + working_dir: '{{storage_path}}' + command: + - '{{learn2rag_path}}/services/qdrant/qdrant{% if is_windows %}.exe{% endif %}' + - '--config-path' + - '{{storage_path}}/qdrant_config.yml' + # https://qdrant.tech/documentation/guides/configuration/ + environment: + QDRANT__LOG_LEVEL: 'ERROR' + QDRANT__SERVICE__HOST: '127.0.0.1' + QDRANT__SERVICE__HTTP_PORT: '{{ports.qdrant_http}}' + QDRANT__TELEMETRY_DISABLED: 'true' + import: + working_dir: '{{storage_path}}' + command: + - '{{learn2rag_path}}/configurator{% if is_windows %}.exe{% endif %}' + - '--logging-config' + - '{{storage_path}}/logging_config.yml' + - '--schedule-interval' + - 'PT5M' + - 'learn2rag.importer' + - '--config' + - '{{storage_path}}/importer_config.json' + environment: + LEARN2RAG_PATH: '{{learn2rag_path}}' + STORAGE_PATH: '{{storage_path}}' + QDRANT__SERVICE__HTTP_PORT: '{{ports.qdrant_http}}' + QDRANT__SERVICE__API_KEY: '{{qdrant_api_key}}' + PIPELINE_USER_CONFIG: '{{storage_path}}/basic_user_config.json' + IMPORTER_CONFIG: '{{storage_path}}/importer_config.json' + PIPELINE_OPT_CONFIG: '{{learn2rag_path}}/learn2rag/pipeline/opt_config.json' + LANGCHAIN_API_KEY: '1' + LANGCHAIN_TRACING_V2: 'false' + LLM_API_TYPE: '{{language_model.api}}' + LLM_API_URL: '{{ language_model.url }}' + LLM_API_TOKEN: '{{language_model.token}}' + LLM_API_MODEL: '{{language_model.model}}' + rag: + working_dir: '{{storage_path}}' + command: + - '{{learn2rag_path}}/configurator{% if is_windows %}.exe{% endif %}' + - 'learn2rag.pipeline' + - '--logging-config' + - '{{storage_path}}/logging_config.yml' + environment: + LEARN2RAG_PATH: '{{learn2rag_path}}' + LEARN2RAG_PIPELINE_PORT: '{{ports.pipeline}}' + QDRANT__SERVICE__HTTP_PORT: '{{ports.qdrant_http}}' + QDRANT__SERVICE__API_KEY: '{{qdrant_api_key}}' + PIPELINE_USER_CONFIG: '{{storage_path}}/basic_user_config.json' + IMPORTER_CONFIG: '{{storage_path}}/importer_config.json' + PIPELINE_OPT_CONFIG: '{{learn2rag_path}}/learn2rag/pipeline/opt_config.json' + LANGCHAIN_API_KEY: '1' + LANGCHAIN_TRACING_V2: 'false' + LLM_API_TYPE: '{{language_model.api}}' + LLM_API_URL: '{{language_model.url}}' + LLM_API_TOKEN: '{{language_model.token}}' + LLM_API_MODEL: '{{language_model.model}}' diff --git a/pyproject.toml b/pyproject.toml index feb8e7f..d31cb0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -188,6 +188,7 @@ dependencies = [ "python-pptx (>=1.0.2,<2.0.0)", "msoffcrypto-tool (>=6.0.0,<7.0.0)", "ebooklib (>=0.20,<0.21)", + "apscheduler==3.11.2", ] description = "" license = "MIT" diff --git a/uv.lock b/uv.lock index ba2dd60..4e57930 100644 --- a/uv.lock +++ b/uv.lock @@ -130,6 +130,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, ] +[[package]] +name = "apscheduler" +version = "3.11.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzlocal" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/07/12/3e4389e5920b4c1763390c6d371162f3784f86f85cd6d6c1bfe68eef14e2/apscheduler-3.11.2.tar.gz", hash = "sha256:2a9966b052ec805f020c8c4c3ae6e6a06e24b1bf19f2e11d91d8cca0473eef41", size = 108683, upload-time = "2025-12-22T00:39:34.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/64/2e54428beba8d9992aa478bb8f6de9e4ecaa5f8f513bcfd567ed7fb0262d/apscheduler-3.11.2-py3-none-any.whl", hash = "sha256:ce005177f741409db4e4dd40a7431b76feb856b9dd69d57e0da49d6715bfd26d", size = 64439, upload-time = "2025-12-22T00:39:33.303Z" }, +] + [[package]] name = "asttokens" version = "3.0.1" @@ -1671,6 +1683,7 @@ dependencies = [ { name = "annotated-types" }, { name = "antlr4-python3-runtime" }, { name = "anyio" }, + { name = "apscheduler" }, { name = "attrs" }, { name = "azure-identity" }, { name = "backoff" }, @@ -1859,6 +1872,7 @@ requires-dist = [ { name = "annotated-types", specifier = "==0.7.0" }, { name = "antlr4-python3-runtime", specifier = "==4.9.3" }, { name = "anyio", specifier = "==4.10.0" }, + { name = "apscheduler", specifier = "==3.11.2" }, { name = "attrs", specifier = "==25.3.0" }, { name = "azure-identity", specifier = ">=1.25.1,<2.0.0" }, { name = "backoff", specifier = "==2.2.1" }, From 0f9e1971e80c281ea013b40db2e1d4b40c4ae72f Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 10:48:09 +0200 Subject: [PATCH 14/34] Fix the first import not starting immediately --- learn2rag/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learn2rag/__main__.py b/learn2rag/__main__.py index 93afec2..1afb1ba 100644 --- a/learn2rag/__main__.py +++ b/learn2rag/__main__.py @@ -86,7 +86,7 @@ def configure_logging(config_path: pathlib.Path, debug: bool) -> None: scheduler.add_job( module.main, trigger, - next_run_time=datetime.utcnow(), + next_run_time=datetime.now(), max_instances=1, args=module_args, kwargs=module_kwargs, From a01f3d6857384f838d39ffdbf0f8fd2fd55643ad Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 10:49:34 +0200 Subject: [PATCH 15/34] Fix the look of an input --- learn2rag/ui/templates/pipelines_add.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learn2rag/ui/templates/pipelines_add.html b/learn2rag/ui/templates/pipelines_add.html index 4323205..4ae9f59 100644 --- a/learn2rag/ui/templates/pipelines_add.html +++ b/learn2rag/ui/templates/pipelines_add.html @@ -30,9 +30,9 @@
-
- +
+
{{gettext('Optional')}} From 5a2280f0ff7bd308df99473e7f51498bbd25b36c Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 10:50:04 +0200 Subject: [PATCH 16/34] Add the import interval input in the interface --- learn2rag/ui/__init__.py | 1 + learn2rag/ui/templates/compose/pipelines/continuous.yml | 2 +- learn2rag/ui/templates/pipelines_add.html | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/learn2rag/ui/__init__.py b/learn2rag/ui/__init__.py index 77b988a..82aeb1d 100644 --- a/learn2rag/ui/__init__.py +++ b/learn2rag/ui/__init__.py @@ -396,6 +396,7 @@ def pipeline_create() -> 'str | werkzeug.wrappers.response.Response': data.pop('import', None) data['ports'] = [int(port) for port in request.form.getlist("ports") if port] data['sources'] = request.form.getlist('sources') + data['import_schedule_interval_hours'] = float(data['import_schedule_interval_hours']) name = learn2rag.data.create_entry(app.instance_path, 'pipelines', data) flash(pgettext('flash', 'Added a new pipeline configuration: %(label)s', label=label)) if request.form.get('import'): diff --git a/learn2rag/ui/templates/compose/pipelines/continuous.yml b/learn2rag/ui/templates/compose/pipelines/continuous.yml index bbc1727..9173674 100644 --- a/learn2rag/ui/templates/compose/pipelines/continuous.yml +++ b/learn2rag/ui/templates/compose/pipelines/continuous.yml @@ -136,7 +136,7 @@ services: - '--logging-config' - '{{storage_path}}/logging_config.yml' - '--schedule-interval' - - 'PT5M' + - 'PT{{ (pipeline.import_schedule_interval_hours * 60) | round | int }}M' - 'learn2rag.importer' - '--config' - '{{storage_path}}/importer_config.json' diff --git a/learn2rag/ui/templates/pipelines_add.html b/learn2rag/ui/templates/pipelines_add.html index 4ae9f59..3dfe0db 100644 --- a/learn2rag/ui/templates/pipelines_add.html +++ b/learn2rag/ui/templates/pipelines_add.html @@ -30,6 +30,13 @@
+
+ + +
From faa6d02b683fc94222074b239c6b7329aa35d56c Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 11:37:06 +0200 Subject: [PATCH 17/34] Fix typing --- learn2rag/__main__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/learn2rag/__main__.py b/learn2rag/__main__.py index 1afb1ba..b2f60d3 100644 --- a/learn2rag/__main__.py +++ b/learn2rag/__main__.py @@ -8,11 +8,13 @@ import sys from datetime import datetime, timedelta from types import TracebackType -from typing import Unpack +from typing import Any, Unpack import yaml -from apscheduler.schedulers.blocking import BlockingScheduler -from apscheduler.triggers.interval import IntervalTrigger +# TODO: apscheduler 4.x would come with py.typed +# https://github.com/agronholm/apscheduler/issues/648#issuecomment-1195304357 +from apscheduler.schedulers.blocking import BlockingScheduler # type: ignore[import-untyped] +from apscheduler.triggers.interval import IntervalTrigger # type: ignore[import-untyped] from pydantic import TypeAdapter @@ -63,7 +65,7 @@ def configure_logging(config_path: pathlib.Path, debug: bool) -> None: logging.debug('Learn2RAG launcher starting: %s, %s', args, rest) module = importlib.import_module(args.module) # TODO - module_args = tuple() + module_args: tuple[Any, ...] = () module_kwargs = {} if args.module == 'learn2rag.ollama_tool': # FIXME default config values From 42207c1de7bc41116acaf919bcd7b36f40ac4150 Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 11:57:16 +0200 Subject: [PATCH 18/34] Use a default interval value when missing --- learn2rag/ui/templates/compose/pipelines/continuous.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learn2rag/ui/templates/compose/pipelines/continuous.yml b/learn2rag/ui/templates/compose/pipelines/continuous.yml index 9173674..e8821da 100644 --- a/learn2rag/ui/templates/compose/pipelines/continuous.yml +++ b/learn2rag/ui/templates/compose/pipelines/continuous.yml @@ -136,7 +136,7 @@ services: - '--logging-config' - '{{storage_path}}/logging_config.yml' - '--schedule-interval' - - 'PT{{ (pipeline.import_schedule_interval_hours * 60) | round | int }}M' + - 'PT{{ ((pipeline.import_schedule_interval_hours or 12) * 60) | round | int }}M' - 'learn2rag.importer' - '--config' - '{{storage_path}}/importer_config.json' From 8d8cf99dadee53a901be45d7482222139b272f9f Mon Sep 17 00:00:00 2001 From: kymeyer Date: Mon, 18 May 2026 13:36:22 +0200 Subject: [PATCH 19/34] switch to single mode for pdf import --- learn2rag/importer/loaders/sharepoint_loader.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/learn2rag/importer/loaders/sharepoint_loader.py b/learn2rag/importer/loaders/sharepoint_loader.py index f05abd8..7d3b2ed 100644 --- a/learn2rag/importer/loaders/sharepoint_loader.py +++ b/learn2rag/importer/loaders/sharepoint_loader.py @@ -75,17 +75,8 @@ def _parse_file(file_path: Path, original_item: Any, loader_id: str = "N/A") -> # SPECIAL HANDLING FOR PDF (using PyPDFLoader to avoid unstructured dependencies) elif suffix == ".pdf": logger.info(f"Detected PDF file: {file_path.name} - using PyPDFLoader") - loader = PyPDFLoader(str(file_path)) - _pages = loader.load() - # Merge all pages into one Document so delta-import deduplication works on - # a 1:1 source→document basis (same as all other loaders). - _merged_pdf = Document( - page_content="\n\n".join(p.page_content for p in _pages), - metadata={**(_pages[0].metadata if _pages else {}), "total_pages": len(_pages)}, - ) - _merged_pdf.metadata.pop("page", None) - _merged_pdf.metadata.pop("page_label", None) - docs = [_merged_pdf] + loader = PyPDFLoader(str(file_path), mode="single") + docs = loader.load() # SPECIAL HANDLING FOR IMAGES (Skip due to broken OCR environment) elif suffix in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]: From e04c6af9c24f0950282e615e59a87095c08f448b Mon Sep 17 00:00:00 2001 From: kymeyer Date: Mon, 18 May 2026 14:22:15 +0200 Subject: [PATCH 20/34] update version and remove unused import --- learn2rag/importer/loaders/sharepoint_loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learn2rag/importer/loaders/sharepoint_loader.py b/learn2rag/importer/loaders/sharepoint_loader.py index 7d3b2ed..05c6398 100644 --- a/learn2rag/importer/loaders/sharepoint_loader.py +++ b/learn2rag/importer/loaders/sharepoint_loader.py @@ -7,14 +7,14 @@ and Site-Specific contexts. Author: Kyrill Meyer -Version: 0.0.6 +Version: 0.0.7 Institution: IFDT Creation Date: January 14, 2026 -Last Modified Date: April 24, 2026 +Last Modified Date: May 18, 2026 """ + import hashlib import logging -import os import tempfile import shutil from datetime import datetime, timezone From b3eb4550a3cdf70779933fc7747626f5a02480db Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 14:46:36 +0200 Subject: [PATCH 21/34] Do not specify the pipeline port in the first steps guide --- learn2rag/ui/templates/firststeps_pipelines.html | 1 - 1 file changed, 1 deletion(-) diff --git a/learn2rag/ui/templates/firststeps_pipelines.html b/learn2rag/ui/templates/firststeps_pipelines.html index 8167d51..1c37988 100644 --- a/learn2rag/ui/templates/firststeps_pipelines.html +++ b/learn2rag/ui/templates/firststeps_pipelines.html @@ -12,7 +12,6 @@

You do not have any pipelines configured

{% for name, source in sources.items() %} {% endfor %} - From d7644bafaf6e9f4d9c7bc5ffab7eb1a2e75b525e Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 14:47:54 +0200 Subject: [PATCH 22/34] Add missing interval in the first steps guide --- learn2rag/ui/templates/firststeps_pipelines.html | 1 + 1 file changed, 1 insertion(+) diff --git a/learn2rag/ui/templates/firststeps_pipelines.html b/learn2rag/ui/templates/firststeps_pipelines.html index 1c37988..16d6909 100644 --- a/learn2rag/ui/templates/firststeps_pipelines.html +++ b/learn2rag/ui/templates/firststeps_pipelines.html @@ -12,6 +12,7 @@

You do not have any pipelines configured

{% for name, source in sources.items() %} {% endfor %} + From ef71113ae747dafcce4964f1d30ba3feabeb1dd3 Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 14:48:25 +0200 Subject: [PATCH 23/34] Run the continuous pipeline from the first steps guide --- learn2rag/ui/__init__.py | 6 +++--- learn2rag/ui/templates/firststeps_pipelines.html | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learn2rag/ui/__init__.py b/learn2rag/ui/__init__.py index 82aeb1d..c53f941 100644 --- a/learn2rag/ui/__init__.py +++ b/learn2rag/ui/__init__.py @@ -393,16 +393,16 @@ def pipelines_list() -> 'str | werkzeug.wrappers.response.Response': def pipeline_create() -> 'str | werkzeug.wrappers.response.Response': label = request.form['label'] data: dict[str, Any] = request.form.to_dict() - data.pop('import', None) + data.pop('now', None) data['ports'] = [int(port) for port in request.form.getlist("ports") if port] data['sources'] = request.form.getlist('sources') data['import_schedule_interval_hours'] = float(data['import_schedule_interval_hours']) name = learn2rag.data.create_entry(app.instance_path, 'pipelines', data) flash(pgettext('flash', 'Added a new pipeline configuration: %(label)s', label=label)) - if request.form.get('import'): + if request.form.get('now'): pipeline = learn2rag.data.get_entry(app.instance_path, 'pipelines', name) assert pipeline is not None - start_pipeline(name, pipeline, 'import') + start_pipeline(name, pipeline, 'continuous') return redirect(url_for('pipelines_list')) def start_pipeline(name: str, pipeline: dict[str, Any], template_name: str) -> None: diff --git a/learn2rag/ui/templates/firststeps_pipelines.html b/learn2rag/ui/templates/firststeps_pipelines.html index 16d6909..5d0632e 100644 --- a/learn2rag/ui/templates/firststeps_pipelines.html +++ b/learn2rag/ui/templates/firststeps_pipelines.html @@ -13,7 +13,7 @@

You do not have any pipelines configured

{% endfor %} - +
From badb7eacb7eb94380b4751274d175244784f10c1 Mon Sep 17 00:00:00 2001 From: denkv Date: Mon, 18 May 2026 14:48:55 +0200 Subject: [PATCH 24/34] Change the interface for starting pipelines --- .../compose/pipelines/continuous.yml | 2 +- .../ui/templates/compose/pipelines/import.yml | 2 +- .../templates/compose/pipelines/pipeline.yml | 2 +- learn2rag/ui/templates/pipelines_list.html | 22 ++++++++++++++----- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/learn2rag/ui/templates/compose/pipelines/continuous.yml b/learn2rag/ui/templates/compose/pipelines/continuous.yml index e8821da..2e719c8 100644 --- a/learn2rag/ui/templates/compose/pipelines/continuous.yml +++ b/learn2rag/ui/templates/compose/pipelines/continuous.yml @@ -1,5 +1,5 @@ name: continuous -label: Continuous +label: Start ports: # TODO: labels in the interface currently assume a specific port order - ui diff --git a/learn2rag/ui/templates/compose/pipelines/import.yml b/learn2rag/ui/templates/compose/pipelines/import.yml index 839f08c..a05f728 100644 --- a/learn2rag/ui/templates/compose/pipelines/import.yml +++ b/learn2rag/ui/templates/compose/pipelines/import.yml @@ -1,5 +1,5 @@ name: import -label: Import +label: Start one-time import ports: # TODO: UI port is not used; labels in the interface currently assume a specific port order - ui diff --git a/learn2rag/ui/templates/compose/pipelines/pipeline.yml b/learn2rag/ui/templates/compose/pipelines/pipeline.yml index cb16548..f3ab50e 100644 --- a/learn2rag/ui/templates/compose/pipelines/pipeline.yml +++ b/learn2rag/ui/templates/compose/pipelines/pipeline.yml @@ -1,5 +1,5 @@ name: pipeline -label: RAG +label: Start only the chat ports: # TODO: labels in the interface currently assume a specific port order - ui diff --git a/learn2rag/ui/templates/pipelines_list.html b/learn2rag/ui/templates/pipelines_list.html index 7dbcf1a..0827daf 100644 --- a/learn2rag/ui/templates/pipelines_list.html +++ b/learn2rag/ui/templates/pipelines_list.html @@ -1,3 +1,9 @@ +{% macro pipeline_start(name, project, cls) %} +
+ +
+{% endmacro %} + @@ -44,11 +50,7 @@ {% endfor %} From c1a5acb5841471862de4a5e4d97a3ec6342afb47 Mon Sep 17 00:00:00 2001 From: denkv Date: Tue, 19 May 2026 12:23:05 +0200 Subject: [PATCH 28/34] Fixme --- learn2rag/evaluation/tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learn2rag/evaluation/tools.py b/learn2rag/evaluation/tools.py index b91d0d7..b927bc1 100644 --- a/learn2rag/evaluation/tools.py +++ b/learn2rag/evaluation/tools.py @@ -74,7 +74,9 @@ def ingest_dataset_documents(dataset_name: str) -> None: 'imported_documents_file_path': documents_path, 'llm': None, } - learn2rag.pipeline.ingestion.index(user_config, opt_config) + raise NotImplementedError() + # FIXME + # learn2rag.pipeline.ingestion.index(user_config, opt_config) def read_dataset_qa(dataset_name: str, subdirectory: str, split: str | None=None) -> Any: From 1611c361ef11bf695869e72db2201a4b9ab36dac Mon Sep 17 00:00:00 2001 From: Hanna Reder Date: Tue, 19 May 2026 12:01:34 +0000 Subject: [PATCH 29/34] exclude main.py from mypy test --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d31cb0a..5a33ce6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -205,7 +205,10 @@ build-backend = "setuptools.build_meta" requires = ["setuptools >= 77.0.3"] [tool.mypy] files = "learn2rag" -exclude = "^learn2rag/pipeline/scripts/" +exclude = [ + "^learn2rag/pipeline/scripts/", + "^learn2rag/pipeline/main\\.py$", +] strict = true [[tool.mypy.overrides]] module = [ From 7440ff282392687d7350f4f319528b3637508b4e Mon Sep 17 00:00:00 2001 From: Hanna Reder Date: Tue, 19 May 2026 12:25:05 +0000 Subject: [PATCH 30/34] exclude main.py from mypy test --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5a33ce6..1624596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -215,6 +215,9 @@ module = [ "datasets.*", ] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = ["learn2rag.pipeline.main"] +ignore_errors = true [tool.pytest.ini_options] log_cli = true log_cli_level = "DEBUG" From a156c5240590264dd4be86554df2e6e6a8f330fd Mon Sep 17 00:00:00 2001 From: denkv Date: Tue, 19 May 2026 16:23:51 +0200 Subject: [PATCH 31/34] Fix immediately starting the import when pipeline is configured --- learn2rag/ui/templates/pipelines_add.html | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/learn2rag/ui/templates/pipelines_add.html b/learn2rag/ui/templates/pipelines_add.html index 3dfe0db..36bd296 100644 --- a/learn2rag/ui/templates/pipelines_add.html +++ b/learn2rag/ui/templates/pipelines_add.html @@ -65,12 +65,11 @@
- +
- {{gettext('You can always do the data import later.')}}
From 6f391f2b4d978089145b12b4be39778498c1d278 Mon Sep 17 00:00:00 2001 From: denkv Date: Tue, 19 May 2026 16:34:53 +0200 Subject: [PATCH 32/34] Fix input label --- learn2rag/ui/templates/pipelines_add.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learn2rag/ui/templates/pipelines_add.html b/learn2rag/ui/templates/pipelines_add.html index 36bd296..7915694 100644 --- a/learn2rag/ui/templates/pipelines_add.html +++ b/learn2rag/ui/templates/pipelines_add.html @@ -66,7 +66,7 @@
-
From a6de58d65d0c38f9ef74361daf771611add9053c Mon Sep 17 00:00:00 2001 From: kymeyer Date: Wed, 20 May 2026 15:53:52 +0200 Subject: [PATCH 33/34] update main importer script +tests +typesafe --- learn2rag/importer/loaders/drupal_loader.py | 2 +- learn2rag/importer/main.py | 100 +++++++++++--- learn2rag/importer/readme.md | 71 +++++++++- learn2rag/importer/tests/test_loaders.py | 144 ++++++++++++++++++++ 4 files changed, 290 insertions(+), 27 deletions(-) diff --git a/learn2rag/importer/loaders/drupal_loader.py b/learn2rag/importer/loaders/drupal_loader.py index 9c70868..e274783 100644 --- a/learn2rag/importer/loaders/drupal_loader.py +++ b/learn2rag/importer/loaders/drupal_loader.py @@ -78,7 +78,7 @@ def _build_session(auth_type: str, username: str, password: str, token: str) -> def _html_to_text(html: str) -> str: """Strip HTML tags and return plain text.""" soup = BeautifulSoup(html, "html.parser") - return soup.get_text(separator="\n", strip=True) + return str(soup.get_text(separator="\n", strip=True)) def _extract_page_content(attributes: Dict[str, Any], text_fields: List[str]) -> str: diff --git a/learn2rag/importer/main.py b/learn2rag/importer/main.py index 90d2ea9..e309161 100644 --- a/learn2rag/importer/main.py +++ b/learn2rag/importer/main.py @@ -6,9 +6,9 @@ Author: Kyrill Meyer Institution: IFDT -Version: 0.0.3 +Version: 0.0.4 Creation Date: June 10, 2025 -Last Modified: February 20, 2026 +Last Modified: May 20, 2026 """ import argparse @@ -22,7 +22,9 @@ from .config.config_constants import LOGS_DIR, VERSION from .utils.logging_setup import setup_logging from .utils.config_loader import load_json_config, validate_config_entry -from .loaders.process_loaders import process_configuration_entries +from .loaders.process_loaders import process_configuration_entries, process_delta_imports +from .utils.import_state import ImportState +from learn2rag.pipeline.ingestion import index logger = logging.getLogger("Learn2RAGImporter") @@ -31,17 +33,51 @@ class ImporterArgumentParser(argparse.ArgumentParser): + """Argument parser for the Learn2RAG importer. + + Arguments: + --config Path to the importer config JSON file. + Default: config.json bundled with the package. + --state-file Path to the import-state JSON file that persists per-loader + last-import timestamps across runs. + Default: import_state.json placed next to --config. + --delta Run a delta import instead of a full import. + Intelligent loaders (Drupal, SharePoint) fetch only documents + changed since the last run; plain loaders (Directory, HTML, CSV) + perform a SHA-256 content-hash comparison against the current + Qdrant index and only update changed documents. + On the very first run (no state file) a full import is performed + automatically. + --save-documents Write all loaded documents to loaded_documents.json in the + current working directory after a full import. + Intended for debugging and backwards compatibility only; + disabled by default. + + Environment variables (read by main(), not CLI arguments): + PIPELINE_USER_CONFIG Path to the pipeline user_config.json. + Default: learn2rag/pipeline/user_config.json + PIPELINE_OPT_CONFIG Path to the pipeline opt_config.json. + Default: learn2rag/pipeline/opt_config.json + """ + def __init__(self) -> None: super().__init__() json_config_path = importlib.resources.files("learn2rag.importer.config") / "config.json" - self.add_argument('--config', default=str(json_config_path)) + self.add_argument('--config', default=str(json_config_path), + help='path to the importer config JSON (default: bundled config.json)') + self.add_argument('--state-file', default=None, + help='path to the import state JSON file (default: import_state.json next to --config)') + self.add_argument('--delta', action='store_true', + help='perform a delta import instead of a full import') + self.add_argument('--save-documents', action='store_true', + help='write loaded documents to loaded_documents.json (debug/backwards-compat only)') def init(args: argparse.Namespace) -> None: # Display a small textual description about the app print("------------------------------------------------------------") print("Learn2RAG Importer - DataImporter for Learn2RAG.") - print(f"Version: {VERSION} | Author: IFDT (KM) | Date: February 20, 2026\n") + print(f"Version: {VERSION} | Author: IFDT (KM) | Date: May 20, 2026\n") print("https://github.com/Learn2RAG/") print("------------------------------------------------------------\n") @@ -66,33 +102,59 @@ def init(args: argparse.Namespace) -> None: #main function to run the application def main(args: argparse.Namespace) -> None: statusLogger.info('Import started') - # Load JSON configuration try: config = load_json_config(args.config) logger.debug("Configuration loaded successfully, starting validation...") - # Validate each entry in the configuration + # load pipeline configuration for user and opt settings, needed for delta-import and indexing + user_config_path = os.environ.get("PIPELINE_USER_CONFIG", "learn2rag/pipeline/user_config.json") + opt_config_path = os.environ.get("PIPELINE_OPT_CONFIG", "learn2rag/pipeline/opt_config.json") + with open(user_config_path, "r", encoding="utf-8") as f: + user_config = json.load(f) + with open(opt_config_path, "r", encoding="utf-8") as f: + opt_config = json.load(f) + logger.debug("Pipeline configuration loaded from '%s' and '%s'.", user_config_path, opt_config_path) + + # save import state file next to the importer config if not explicitly specified + + state_file_path = args.state_file if args.state_file else str(Path(args.config).parent / "import_state.json") + import_state = ImportState(state_file_path) + + # validate configuration entries before processing to avoid partial imports and ensure all issues are caught upfront validation_errors = False - for index, entry in enumerate(config.get("loaders", []), start=1): + for entry_idx, entry in enumerate(config.get("loaders", []), start=1): try: loader_type = entry.get("loader_type", "Unknown") - logger.debug(f"Validated configuration entry {index}: {loader_type}") + logger.debug(f"Validated configuration entry {entry_idx}: {loader_type}") validate_config_entry(entry) except ValueError as e: - logger.error(f"Validation error in configuration entry {index}: {e}") + logger.error(f"Validation error in configuration entry {entry_idx}: {e}") validation_errors = True - # Process configuration entries and load documents if not validation_errors: - all_documents = process_configuration_entries(config.get("loaders", [])) - logger.debug(f"Total documents loaded: {len(all_documents)}") - - # Optional: Speichern der Dokumente in einer Datei - output_path = "loaded_documents.json" - with open(output_path, "w", encoding="utf-8") as f: - json.dump([{"metadata": doc.metadata, "content": doc.page_content} for doc in all_documents], f, ensure_ascii=False, indent=4) + if args.delta: + # Delta-Import: Hash-/Timestamp-comparison, direct ingest in Qdrant, + logger.info("Running delta import (state file: %s)", state_file_path) + process_delta_imports( + config_entries=config.get("loaders", []), + user_config=user_config, + opt_config=opt_config, + import_state=import_state, + ) + else: + # full import: all documents load and directly ingest in Qdrant + logger.info("Running full import") + all_documents = process_configuration_entries(config.get("loaders", [])) + logger.debug(f"Total documents loaded: {len(all_documents)}") + index(all_documents, user_config, opt_config) + + # JSON-Dump für Rückwärtskompatibilität (nur mit --save-documents) + if args.save_documents: + output_path = "loaded_documents.json" + with open(output_path, "w", encoding="utf-8") as f: + json.dump([{"metadata": doc.metadata, "content": doc.page_content} for doc in all_documents], f, ensure_ascii=False, indent=4) + logger.debug('Documents saved to: %s', output_path) - logger.info('Documents saved to: %s', output_path) statusLogger.info('Import finished') else: logger.error("Configuration validation failed. No documents were processed.") diff --git a/learn2rag/importer/readme.md b/learn2rag/importer/readme.md index d0b7352..955aaa7 100644 --- a/learn2rag/importer/readme.md +++ b/learn2rag/importer/readme.md @@ -1,9 +1,9 @@ # Learn2RAG Importer -An importer for document sources used within the Learn2RAG pipeline. It reads a `config.json`, delegates loading to the appropriate loader, enriches documents with metadata, and writes results to `loaded_documents.json`. +An importer for document sources used within the Learn2RAG pipeline. It reads a `config.json`, delegates loading to the appropriate loader, enriches documents with metadata, and ingests them directly into Qdrant. **Author:** IFDT, KM -**Version:** 0.0.5 +**Version:** 0.0.9 --- @@ -48,8 +48,8 @@ graph TD G --> I H --> I I --> J[Enrich metadata] - J --> K[loaded_documents.json] - K --> L[Pipeline input] + J --> L[Qdrant] + J -.->|--save-documents| K[loaded_documents.json] classDef purple fill:#9370db,stroke:#000000,stroke-width:2px A:::purple @@ -71,9 +71,57 @@ graph TD ## Running the importer ```bash -python -m learn2rag.importer +python -m learn2rag.importer [OPTIONS] ``` +### CLI options + +| Option | Default | Description | +|---|---|---| +| `--config PATH` | bundled `config.json` | Path to the importer config JSON file | +| `--state-file PATH` | `import_state.json` next to `--config` | Path to the import-state JSON file that persists per-loader timestamps across runs | +| `--delta` | off | Run a delta import instead of a full import (see [Delta import](#delta-import)) | +| `--save-documents` | off | Write all loaded documents to `loaded_documents.json` in the working directory (debug / backwards compatibility) | + +### Environment variables + +The importer reads the pipeline configuration directly from environment variables. These must be set (or the default paths must exist) before running. + +| Variable | Default | Description | +|---|---|---| +| `PIPELINE_USER_CONFIG` | `learn2rag/pipeline/user_config.json` | Path to the pipeline `user_config.json` | +| `PIPELINE_OPT_CONFIG` | `learn2rag/pipeline/opt_config.json` | Path to the pipeline `opt_config.json` | + +### Examples + +**Full import** — loads all documents and ingests them into Qdrant: +```bash +python -m learn2rag.importer --config /data/config.json +``` + +**Delta import** — only processes new or changed documents: +```bash +python -m learn2rag.importer --config /data/config.json --delta +``` + +**Full import with debug output:** +```bash +python -m learn2rag.importer --config /data/config.json --save-documents +``` + +--- + +## Delta import + +With `--delta` the importer uses a loader-specific strategy to minimise the number of documents re-processed: + +- **Intelligent loaders** (DrupalLoader, SharepointLoader): 2-pass approach — fetch all current document IDs to detect deletions, then load only documents changed since the last successful run via a server-side timestamp filter. +- **Plain loaders** (DirectoryLoader, HTMLLoader, CSVLoader): full load followed by SHA-256 content-hash comparison against the existing Qdrant index to detect additions, changes, and deletions. + +The import timestamp for each loader is only persisted after a **successful** run. A failed run will therefore be retried in full on the next call. + +On the **very first run** (empty state file or empty Qdrant collection) a full import is performed automatically regardless of the `--delta` flag. + --- ## Configuration @@ -98,7 +146,9 @@ Edit `config/config.json` to define one or more loaders. Each entry requires at ## Output -Results are written to `loaded_documents.json` in the project root. Each document entry contains a `metadata` object and a `content` string: +By default, documents are ingested **directly into Qdrant** without writing any local file. The `loaded_documents.json` file is only written when `--save-documents` is passed (useful for debugging or backwards compatibility). + +Each document entry contains a `metadata` object and a `content` string: ```json [ @@ -432,4 +482,11 @@ where - unified `source` field as document identifier across all loaders in metadata (directory: file path, HTML: URL, SharePoint: web URL, Drupal: node URL, CSV: file path) - delta import now uses `get_documents` from the pipeline - hash comparison now uses sorted chunk hashes per source for stable results - - **Breaking change:** Qdrant payload field renamed from `path` → `source`; existing collections must be deleted and re-imported \ No newline at end of file + - **Breaking change:** Qdrant payload field renamed from `path` → `source`; existing collections must be deleted and re-imported +- v0.1.0 + - documents are now ingested directly into Qdrant instead of being written to `loaded_documents.json` + - added `--delta` flag to run a delta import (hash/timestamp comparison, direct Qdrant update) + - added `--state-file` flag to override the path of the per-loader import-state JSON + - added `--save-documents` flag to optionally write `loaded_documents.json` for debugging / backwards compatibility + - pipeline configuration (`user_config`, `opt_config`) is now read from `PIPELINE_USER_CONFIG` / `PIPELINE_OPT_CONFIG` environment variables + - loop variable `index` renamed to `entry_idx` to avoid shadowing the `index()` import from `learn2rag.pipeline.ingestion` \ No newline at end of file diff --git a/learn2rag/importer/tests/test_loaders.py b/learn2rag/importer/tests/test_loaders.py index 734dbb5..8b14b98 100644 --- a/learn2rag/importer/tests/test_loaders.py +++ b/learn2rag/importer/tests/test_loaders.py @@ -255,3 +255,147 @@ def test_metadata_set_correctly(self, mock_get: MagicMock) -> None: self.assertEqual(doc.metadata.get("loader_id"), "test_meta") self.assertIn("content_hash", doc.metadata) self.assertEqual(doc.metadata.get("loader_type"), "HTMLLoader") + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + +def _doc(source: str, content_hash: str) -> Document: + """Create a minimal Document with source and content_hash metadata.""" + return Document( + page_content="content", + metadata={"source": source, "content_hash": content_hash}, + ) + + +_USER_CONFIG: dict[str, str] = {"collection_name": "test_collection"} +_OPT_CONFIG: dict[str, Any] = {} + +# --------------------------------------------------------------------------- +# _delta_by_source unit tests (no Qdrant required) +# --------------------------------------------------------------------------- + +class DeltaBySourceTestCase(unittest.TestCase): + """Unit tests for _delta_by_source. + + All Qdrant calls (delete_documents, update_documents) are mocked so no + running Qdrant instance is required. + """ + + def setUp(self) -> None: + from ..loaders.process_loaders import _delta_by_source # local import to avoid top-level side effects + self._delta_by_source = _delta_by_source + self._patch_delete = patch("learn2rag.importer.loaders.process_loaders.delete_documents") + self._patch_update = patch("learn2rag.importer.loaders.process_loaders.update_documents") + self.mock_delete = self._patch_delete.start() + self.mock_update = self._patch_update.start() + + def tearDown(self) -> None: + self._patch_delete.stop() + self._patch_update.stop() + + # -- no changes -------------------------------------------------------- + + def test_no_changes_no_qdrant_calls(self) -> None: + """When nothing changed, neither delete nor update should be called.""" + existing_map = {"a.txt": "hash_a", "b.txt": "hash_b"} + all_docs = [_doc("a.txt", "hash_a"), _doc("b.txt", "hash_b")] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + self.mock_delete.assert_not_called() + self.mock_update.assert_not_called() + + # -- new document ------------------------------------------------------ + + def test_new_document_is_updated(self) -> None: + """A source that does not exist in Qdrant yet must be passed to update_documents.""" + existing_map = {"a.txt": "hash_a"} + all_docs = [_doc("a.txt", "hash_a"), _doc("new.txt", "hash_new")] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + self.mock_delete.assert_not_called() + updated_sources = {d.metadata["source"] for d in self.mock_update.call_args[0][1]} + self.assertIn("new.txt", updated_sources) + self.assertNotIn("a.txt", updated_sources) + + # -- changed document -------------------------------------------------- + + def test_changed_document_is_updated(self) -> None: + """A source whose hash differs from the stored one must be re-indexed.""" + existing_map = {"a.txt": "hash_a_old"} + all_docs = [_doc("a.txt", "hash_a_new")] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + self.mock_delete.assert_not_called() + updated_sources = {d.metadata["source"] for d in self.mock_update.call_args[0][1]} + self.assertIn("a.txt", updated_sources) + + # -- deleted document -------------------------------------------------- + + def test_deleted_document_is_removed(self) -> None: + """A source present in Qdrant but absent from the fresh load must be deleted.""" + existing_map = {"a.txt": "hash_a", "gone.txt": "hash_gone"} + all_docs = [_doc("a.txt", "hash_a")] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + deleted = self.mock_delete.call_args[0][1] + self.assertIn("gone.txt", deleted) + self.mock_update.assert_not_called() + + # -- mixed scenario ---------------------------------------------------- + + def test_mixed_new_changed_deleted_unchanged(self) -> None: + """Combined scenario: new, changed, deleted and unchanged in one call.""" + existing_map = { + "unchanged.txt": "hash_u", + "changed.txt": "hash_c_old", + "deleted.txt": "hash_d", + } + all_docs = [ + _doc("unchanged.txt", "hash_u"), # unchanged — must not be touched + _doc("changed.txt", "hash_c_new"), # changed — must be re-indexed + _doc("new.txt", "hash_n"), # new — must be indexed + # deleted.txt is absent # deleted — must be removed + ] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + deleted = self.mock_delete.call_args[0][1] + self.assertIn("deleted.txt", deleted) + self.assertNotIn("unchanged.txt", deleted) + + updated_sources = {d.metadata["source"] for d in self.mock_update.call_args[0][1]} + self.assertIn("changed.txt", updated_sources) + self.assertIn("new.txt", updated_sources) + self.assertNotIn("unchanged.txt", updated_sources) + + # -- empty existing map (initial run) ---------------------------------- + + def test_initial_run_all_documents_indexed(self) -> None: + """On the first run (empty Qdrant) every document must be passed to update_documents.""" + existing_map: dict[str, str] = {} + all_docs = [_doc("a.txt", "hash_a"), _doc("b.txt", "hash_b")] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + self.mock_delete.assert_not_called() + updated_sources = {d.metadata["source"] for d in self.mock_update.call_args[0][1]} + self.assertEqual(updated_sources, {"a.txt", "b.txt"}) + + # -- empty fresh load (all documents removed) -------------------------- + + def test_all_documents_removed(self) -> None: + """If the loader returns nothing, all existing sources must be deleted.""" + existing_map = {"a.txt": "hash_a", "b.txt": "hash_b"} + all_docs: list[Document] = [] + + self._delta_by_source(all_docs, existing_map, "loader1", _USER_CONFIG, _OPT_CONFIG) + + deleted = self.mock_delete.call_args[0][1] + self.assertCountEqual(deleted, ["a.txt", "b.txt"]) + self.mock_update.assert_not_called() From 854cd30953b00a0c0a5900d7850ff0177bbe721b Mon Sep 17 00:00:00 2001 From: Carolin Walter Date: Tue, 26 May 2026 09:51:09 +0000 Subject: [PATCH 34/34] adapted generate from path to source --- learn2rag/pipeline/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learn2rag/pipeline/generate.py b/learn2rag/pipeline/generate.py index 64e3efc..cb913a6 100644 --- a/learn2rag/pipeline/generate.py +++ b/learn2rag/pipeline/generate.py @@ -18,7 +18,7 @@ def generate(query: str, search_results: list[ScoredPoint], opt_config: dict[str assert llm is not None if hasattr(search_results, "points"): search_results = search_results.points - context = "\n\n".join([context_template.format(source=result.payload['path'], content=result.payload['content']) for result in search_results]) # type: ignore[index] + context = "\n\n".join([context_template.format(source=result.payload['source'], content=result.payload['content']) for result in search_results]) # type: ignore[index] system_message = SystemMessagePromptTemplate.from_template(opt_config["prompt"]) user_message = HumanMessagePromptTemplate.from_template("{question}") prompt = ChatPromptTemplate.from_messages([system_message, user_message]) @@ -33,7 +33,7 @@ def generate_stream(query: str, search_results: list[ScoredPoint], opt_config: d if hasattr(search_results, "points"): search_results = search_results.points - context = "\n\n".join([context_template.format(source=result.payload['path'], content=result.payload['content']) for result in search_results]) # type: ignore[index] + context = "\n\n".join([context_template.format(source=result.payload['source'], content=result.payload['content']) for result in search_results]) # type: ignore[index] system_message = SystemMessagePromptTemplate.from_template(opt_config["prompt"]) user_message = HumanMessagePromptTemplate.from_template("{question}") prompt = ChatPromptTemplate.from_messages([system_message, user_message])
-