Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions app/services/vector_store/extended_pg_vector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import time
import logging
from typing import Optional
from typing import Optional, Any, Dict, List, Union
from sqlalchemy import event
from sqlalchemy import delete
from sqlalchemy.orm import Session
Expand All @@ -17,6 +17,63 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.setup_query_logging()

@staticmethod
def _sanitize_parameters_for_logging(
    parameters: Union[Dict, List, tuple, Any]
) -> Any:
    """Return a log-friendly copy of SQL bind parameters.

    Large payloads are replaced by short placeholders so that query
    logging does not dump whole embedding vectors or documents:

    * a list/tuple with more than 10 items whose first 10 items are
      numbers becomes ``"<embedding vector of length N>"``;
    * a dict value whose key contains ``"embedding"`` (case-insensitive)
      gets the same placeholder, provided the value has a length;
    * a non-empty list/tuple made up only of such vectors becomes
      ``"<N embedding vectors>"``;
    * strings longer than 500 characters are cut to 500 characters and
      suffixed with ``"... (truncated)"``;
    * nested dicts, lists and tuples are processed recursively.

    The input is never mutated.

    :param parameters: ``None``, a dict, a list/tuple, or any other value.
    :return: The sanitized structure. Dicts come back as plain ``dict``;
        lists/tuples keep their type when it can be rebuilt from an
        iterable, otherwise a plain ``tuple``/``list`` is returned.
        Anything that is not a dict, list or tuple is returned unchanged.
    """
    max_text_length = 500
    vector_probe = 10

    def _is_vector(value: Any) -> bool:
        # Only the first few items are inspected to keep the check cheap.
        return (
            isinstance(value, (list, tuple))
            and len(value) > vector_probe
            and all(isinstance(x, (int, float)) for x in value[:vector_probe])
        )

    def _shrink(value: Any) -> Any:
        # Sanitize one element that lives inside a container.
        if _is_vector(value):
            return f"<embedding vector of length {len(value)}>"
        if isinstance(value, str) and len(value) > max_text_length:
            return value[:max_text_length] + "... (truncated)"
        if isinstance(value, (dict, list, tuple)):
            return _walk(value)
        return value

    def _walk(container: Any) -> Any:
        if isinstance(container, dict):
            cleaned_map = {}
            for key, value in container.items():
                if "embedding" in str(key).lower():
                    try:
                        size = len(value)
                    except TypeError:
                        # e.g. None or an integer id stored under an
                        # "embedding..." key: nothing to summarize, so
                        # fall through to the generic handling instead
                        # of crashing the logging hook.
                        pass
                    else:
                        cleaned_map[key] = f"<embedding vector of length {size}>"
                        continue
                cleaned_map[key] = _shrink(value)
            return cleaned_map

        if isinstance(container, (list, tuple)):
            # A batch consisting solely of vectors collapses to one summary.
            if container and all(_is_vector(item) for item in container):
                return f"<{len(container)} embedding vectors>"
            cleaned_items = [_shrink(item) for item in container]
            try:
                return type(container)(cleaned_items)
            except TypeError:
                # Tuple subclasses with positional fields (namedtuple-like)
                # cannot be rebuilt from a single iterable.
                if isinstance(container, tuple):
                    return tuple(cleaned_items)
                return cleaned_items

        return container

    return _walk(parameters)

def setup_query_logging(self):
"""Enable query logging for this vector store only if DEBUG_PGVECTOR_QUERIES is set"""
# Only setup logging if the environment variable is set to a truthy value
Expand Down Expand Up @@ -45,7 +102,10 @@ def receive_before_cursor_execute(
if "langchain_pg_embedding" in statement:
context._query_start_time = time.time()
logger.info(f"STARTING QUERY: {statement}")
logger.info(f"PARAMETERS: {parameters}")
sanitized_params = ExtendedPgVector._sanitize_parameters_for_logging(
parameters
)
logger.info(f"PARAMETERS: {sanitized_params}")

@event.listens_for(Engine, "after_cursor_execute")
def receive_after_cursor_execute(
Expand Down
46 changes: 24 additions & 22 deletions app/utils/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def cleanup_temp_encoding_file(loader) -> None:

:param loader: The document loader that may have created a temporary file
"""
if hasattr(loader, "_temp_filepath"):
if hasattr(loader, "_temp_filepath") and loader._temp_filepath is not None:
try:
os.remove(loader._temp_filepath)
except Exception as e:
Expand Down Expand Up @@ -90,7 +90,9 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
mode="w", encoding="utf-8", suffix=".csv", delete=False
) as temp_file:
# Read the original file with detected encoding
with open(filepath, "r", encoding=encoding, errors="replace") as original_file:
with open(
filepath, "r", encoding=encoding, errors="replace"
) as original_file:
content = original_file.read()
temp_file.write(content)

Expand All @@ -111,40 +113,40 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
elif file_ext == "rst":
loader = UnstructuredRSTLoader(filepath, mode="elements")
elif file_ext == "xml" or file_content_type in [
"application/xml",
"text/xml",
"application/xhtml+xml",
]:
"application/xml",
"text/xml",
"application/xhtml+xml",
]:
loader = UnstructuredXMLLoader(filepath)
elif file_ext in ["ppt", "pptx"] or file_content_type in [
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]:
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]:
loader = UnstructuredPowerPointLoader(filepath)
elif file_ext == "md" or file_content_type in [
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
]:
"text/markdown",
"text/x-markdown",
"application/markdown",
"application/x-markdown",
]:
loader = UnstructuredMarkdownLoader(filepath)
elif file_ext == "epub" or file_content_type == "application/epub+zip":
loader = UnstructuredEPubLoader(filepath)
elif file_ext in ["doc", "docx"] or file_content_type in [
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]:
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]:
loader = Docx2txtLoader(filepath)
elif file_ext in ["xls", "xlsx"] or file_content_type in [
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]:
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]:
loader = UnstructuredExcelLoader(filepath)
elif file_ext == "json" or file_content_type == "application/json":
loader = TextLoader(filepath, autodetect_encoding=True)
elif file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):
file_content_type and file_content_type.find("text/") >= 0
):
loader = TextLoader(filepath, autodetect_encoding=True)
else:
loader = TextLoader(filepath, autodetect_encoding=True)
Expand Down