diff --git a/.gitignore b/.gitignore index 02414e2b..6fad4cde 100644 --- a/.gitignore +++ b/.gitignore @@ -48,5 +48,7 @@ nul /.github/plans *.xlsx /artifacts/tests +/artifacts/tmp/* +/artifacts/tmp scripts/agent.json scripts/me.json diff --git a/application/single_app/config.py b/application/single_app/config.py index dd712ec8..e8c682fe 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -94,7 +94,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.240.020" +VERSION = "0.240.056" SECRET_KEY = os.getenv('SECRET_KEY', 'dev-secret-key-change-in-production') diff --git a/application/single_app/functions_conversation_metadata.py b/application/single_app/functions_conversation_metadata.py index 1b83752d..0a993ee9 100644 --- a/application/single_app/functions_conversation_metadata.py +++ b/application/single_app/functions_conversation_metadata.py @@ -104,6 +104,53 @@ def _build_primary_context_from_scope_selection( return None +def _extract_document_id_from_search_result(doc): + """Resolve a stable parent document ID from a search result.""" + document_id = str(doc.get('document_id') or '').strip() + if document_id: + return document_id + + chunk_identifier = str(doc.get('id') or '').strip() + if not chunk_identifier: + return None + + if '_' in chunk_identifier: + return '_'.join(chunk_identifier.split('_')[:-1]) + + return chunk_identifier + + +def _build_last_grounded_document_refs(document_map): + """Build the exact reusable grounded document set for the latest search-backed turn.""" + grounded_refs = [] + + for document_id, doc_info in document_map.items(): + scope_info = doc_info.get('scope') or {} + scope_type = scope_info.get('scope') + scope_id = scope_info.get('id') + if not document_id or not scope_type or not scope_id: + continue + + ref = { + 'document_id': document_id, + 'scope': scope_type, + 'scope_id': scope_id, + 'file_name': doc_info.get('file_name'), + 'classification': 
doc_info.get('classification'), + } + + if scope_type == 'group': + ref['group_id'] = scope_id + elif scope_type == 'public': + ref['public_workspace_id'] = scope_id + else: + ref['user_id'] = scope_id + + grounded_refs.append(ref) + + return grounded_refs + + def collect_conversation_metadata(user_message, conversation_id, user_id, active_group_id=None, document_scope=None, selected_document_id=None, model_deployment=None, hybrid_search_enabled=False, @@ -179,20 +226,17 @@ def collect_conversation_metadata(user_message, conversation_id, user_id, active chunk_id = doc.get('id') doc_scope_result = _determine_document_scope(doc, user_id, active_group_id) classification = doc.get('document_classification', 'None') - - if chunk_id: - # Extract document ID from chunk ID (assumes format: doc_id_chunkNumber) - if '_' in chunk_id: - document_id = '_'.join(chunk_id.split('_')[:-1]) # Remove last part (chunk number) - else: - document_id = chunk_id # Use full ID if no underscore + document_id = _extract_document_id_from_search_result(doc) + + if document_id and chunk_id: # Initialize document entry if not exists if document_id not in document_map: document_map[document_id] = { 'scope': doc_scope_result, 'chunk_ids': [], - 'classification': classification + 'classification': classification, + 'file_name': doc.get('file_name') or doc.get('title') or 'Unknown Document' } # Add chunk ID to this document @@ -538,6 +582,9 @@ def collect_conversation_metadata(user_message, conversation_id, user_id, active current_tags[semantic_key] = semantic_tag # Update the tags array conversation_item['tags'] = list(current_tags.values()) + if document_map: + conversation_item['last_grounded_document_refs'] = _build_last_grounded_document_refs(document_map) + # --- Scope Lock Logic --- current_scope_locked = conversation_item.get('scope_locked') diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index 4dec6803..2ff2fc95 100644 --- 
a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -15,6 +15,540 @@ def allowed_file(filename, allowed_extensions=None): allowed_extensions = ALLOWED_EXTENSIONS return '.' in filename and \ filename.rsplit('.', 1)[1].lower() in allowed_extensions + + +ARCHIVED_SCOPE_PREFIX = "__archived__::" +CURRENT_ALIAS_BLOB_PATH_MODE = "current_alias" +ARCHIVED_REVISION_BLOB_PATH_MODE = "archived_revision" + + +def _get_blob_container_name(group_id=None, public_workspace_id=None): + if public_workspace_id is not None: + return storage_account_public_documents_container_name + if group_id is not None: + return storage_account_group_documents_container_name + return storage_account_user_documents_container_name + + +def _get_document_scope_id(document_item=None, user_id=None, group_id=None, public_workspace_id=None): + if public_workspace_id is None and document_item is not None: + public_workspace_id = document_item.get("public_workspace_id") + if group_id is None and document_item is not None: + group_id = document_item.get("group_id") + if user_id is None and document_item is not None: + user_id = document_item.get("user_id") + + return public_workspace_id or group_id or user_id + + +def build_current_blob_path(blob_filename, user_id=None, group_id=None, public_workspace_id=None): + scope_id = _get_document_scope_id( + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + if not scope_id or not blob_filename: + return None + + return f"{scope_id}/{blob_filename}" + + +def build_archived_blob_path(document_item): + scope_id = _get_document_scope_id(document_item=document_item) + revision_family_id = document_item.get("revision_family_id") or document_item.get("id") + document_id = document_item.get("id") + file_name = document_item.get("file_name") + + if not scope_id or not revision_family_id or not document_id or not file_name: + return None + + return 
f"{scope_id}/{revision_family_id}/{document_id}/{file_name}" + + +def get_document_blob_storage_info(document_item, user_id=None, group_id=None, public_workspace_id=None, prefer_archived=False): + if not document_item: + return None, None + + container_name = document_item.get("blob_container") or _get_blob_container_name( + group_id=group_id or document_item.get("group_id"), + public_workspace_id=public_workspace_id or document_item.get("public_workspace_id"), + ) + + archived_blob_path = document_item.get("archived_blob_path") + blob_path = document_item.get("blob_path") + + if prefer_archived and archived_blob_path: + return container_name, archived_blob_path + + if blob_path: + return container_name, blob_path + + if document_item.get("blob_path_mode") == ARCHIVED_REVISION_BLOB_PATH_MODE and archived_blob_path: + return container_name, archived_blob_path + + return container_name, build_current_blob_path( + document_item.get("file_name"), + user_id=user_id or document_item.get("user_id"), + group_id=group_id or document_item.get("group_id"), + public_workspace_id=public_workspace_id or document_item.get("public_workspace_id"), + ) + + +def get_document_blob_delete_targets(document_item, user_id=None, group_id=None, public_workspace_id=None): + targets = [] + seen = set() + + container_name, primary_blob_path = get_document_blob_storage_info( + document_item, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + + for blob_path in [primary_blob_path, document_item.get("archived_blob_path")]: + if not container_name or not blob_path: + continue + + key = (container_name, blob_path) + if key in seen: + continue + + seen.add(key) + targets.append(key) + + return targets + + +def _get_blob_service_client(): + blob_service_client = CLIENTS.get("storage_account_office_docs_client") + if not blob_service_client: + raise Exception("Blob service client not available or not configured.") + return blob_service_client + + +def 
_blob_exists(container_name, blob_path): + if not container_name or not blob_path: + return False + + blob_service_client = _get_blob_service_client() + blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path) + return blob_client.exists() + + +def _copy_blob_to_blob(source_container_name, source_blob_path, destination_container_name, destination_blob_path, overwrite=False): + if not source_container_name or not source_blob_path: + raise ValueError("Source blob reference is required") + if not destination_container_name or not destination_blob_path: + raise ValueError("Destination blob reference is required") + if source_container_name == destination_container_name and source_blob_path == destination_blob_path: + return destination_blob_path + + blob_service_client = _get_blob_service_client() + source_blob_client = blob_service_client.get_blob_client(container=source_container_name, blob=source_blob_path) + destination_blob_client = blob_service_client.get_blob_client(container=destination_container_name, blob=destination_blob_path) + + if destination_blob_client.exists() and not overwrite: + return destination_blob_path + if not source_blob_client.exists(): + raise FileNotFoundError(f"Source blob not found: {source_container_name}/{source_blob_path}") + + properties = source_blob_client.get_blob_properties() + source_metadata = dict(properties.metadata) if properties.metadata else None + temp_file_path = None + + try: + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file_path = temp_file.name + download_stream = source_blob_client.download_blob() + for chunk in download_stream.chunks(): + temp_file.write(chunk) + + with open(temp_file_path, "rb") as temp_file_handle: + destination_blob_client.upload_blob( + temp_file_handle, + overwrite=overwrite, + metadata=source_metadata, + ) + finally: + if temp_file_path and os.path.exists(temp_file_path): + os.remove(temp_file_path) + + return destination_blob_path + + 
+def _archive_previous_document_blob(previous_document, user_id=None, group_id=None, public_workspace_id=None): + if not previous_document: + return None + + container_name, current_blob_path = get_document_blob_storage_info( + previous_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + archived_blob_path = previous_document.get("archived_blob_path") or build_archived_blob_path(previous_document) + + if not container_name or not archived_blob_path: + return None + + archived_available = False + + if archived_blob_path == current_blob_path: + archived_available = _blob_exists(container_name, archived_blob_path) + elif _blob_exists(container_name, archived_blob_path): + archived_available = True + elif current_blob_path and _blob_exists(container_name, current_blob_path): + _copy_blob_to_blob( + container_name, + current_blob_path, + container_name, + archived_blob_path, + overwrite=False, + ) + archived_available = True + + if not archived_available: + print( + f"Warning: Could not archive prior revision blob for document {previous_document.get('id')}" + ) + return None + + previous_document["blob_container"] = container_name + previous_document["blob_path"] = archived_blob_path + previous_document["archived_blob_path"] = archived_blob_path + previous_document["blob_path_mode"] = ARCHIVED_REVISION_BLOB_PATH_MODE + return archived_blob_path + + +def _promote_document_blob_to_current_alias(promoted_document, user_id=None, group_id=None, public_workspace_id=None): + if not promoted_document: + return None + + container_name = promoted_document.get("blob_container") or _get_blob_container_name( + group_id=group_id or promoted_document.get("group_id"), + public_workspace_id=public_workspace_id or promoted_document.get("public_workspace_id"), + ) + current_blob_path = build_current_blob_path( + promoted_document.get("file_name"), + user_id=user_id or promoted_document.get("user_id"), + group_id=group_id or 
promoted_document.get("group_id"), + public_workspace_id=public_workspace_id or promoted_document.get("public_workspace_id"), + ) + source_blob_path = promoted_document.get("archived_blob_path") or promoted_document.get("blob_path") + + if not container_name or not current_blob_path: + return None + + if source_blob_path and source_blob_path != current_blob_path and _blob_exists(container_name, source_blob_path): + _copy_blob_to_blob( + container_name, + source_blob_path, + container_name, + current_blob_path, + overwrite=True, + ) + if not promoted_document.get("archived_blob_path"): + promoted_document["archived_blob_path"] = source_blob_path + + promoted_document["blob_container"] = container_name + promoted_document["blob_path"] = current_blob_path + promoted_document["blob_path_mode"] = CURRENT_ALIAS_BLOB_PATH_MODE + return current_blob_path + + +def _safe_int(value): + try: + return int(value) + except (TypeError, ValueError): + return 0 + + +def _get_documents_container(group_id=None, public_workspace_id=None): + if public_workspace_id is not None: + return cosmos_public_documents_container + if group_id is not None: + return cosmos_group_documents_container + return cosmos_user_documents_container + + +def _get_search_client(group_id=None, public_workspace_id=None): + if public_workspace_id is not None: + return CLIENTS["search_client_public"] + if group_id is not None: + return CLIENTS["search_client_group"] + return CLIENTS["search_client_user"] + + +def _get_document_family_key(document_item): + revision_family_id = document_item.get("revision_family_id") + if revision_family_id: + return revision_family_id + + scope_value = ( + document_item.get("public_workspace_id") + or document_item.get("group_id") + or document_item.get("user_id") + or "unknown" + ) + file_name = document_item.get("file_name", "") + return f"legacy::{scope_value}::{file_name}" + + +def _document_revision_sort_key(document_item): + return ( + _safe_int(document_item.get("version")), 
+ str(document_item.get("upload_date") or ""), + _safe_int(document_item.get("_ts")), + ) + + +def _choose_current_document(family_documents): + explicitly_current = [doc for doc in family_documents if doc.get("is_current_version") is True] + candidate_pool = explicitly_current if explicitly_current else family_documents + return max(candidate_pool, key=_document_revision_sort_key) + + +def select_current_documents(documents): + families = {} + + for document_item in documents or []: + family_key = _get_document_family_key(document_item) + families.setdefault(family_key, []).append(document_item) + + current_documents = [] + for family_documents in families.values(): + current_documents.append(_choose_current_document(family_documents)) + + return current_documents + + +def sort_documents(documents, sort_by="_ts", sort_order="DESC"): + reverse = str(sort_order).lower() != "asc" + + def sort_key(document_item): + value = document_item.get(sort_by) + if sort_by == "_ts": + return _safe_int(value) + if value is None: + return "" + if isinstance(value, str): + return value.lower() + if isinstance(value, (int, float)): + return value + return str(value).lower() + + return sorted(documents or [], key=sort_key, reverse=reverse) + + +def _query_accessible_documents(user_id, group_id=None, public_workspace_id=None): + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + + if public_workspace_id is not None: + query = """ + SELECT * + FROM c + WHERE c.public_workspace_id = @public_workspace_id + """ + parameters = [ + {"name": "@public_workspace_id", "value": public_workspace_id} + ] + elif group_id is not None: + query = """ + SELECT * + FROM c + WHERE c.group_id = @group_id + OR ARRAY_CONTAINS(c.shared_group_ids, @group_id) + OR EXISTS(SELECT VALUE s FROM s IN c.shared_group_ids WHERE STARTSWITH(s, @group_id_prefix)) + """ + parameters = [ + {"name": "@group_id", "value": group_id}, + {"name": "@group_id_prefix", 
"value": f"{group_id},"} + ] + else: + query = """ + SELECT * + FROM c + WHERE c.user_id = @user_id + OR ARRAY_CONTAINS(c.shared_user_ids, @user_id) + OR EXISTS(SELECT VALUE s FROM s IN c.shared_user_ids WHERE STARTSWITH(s, @user_id_prefix)) + """ + parameters = [ + {"name": "@user_id", "value": user_id}, + {"name": "@user_id_prefix", "value": f"{user_id},"} + ] + + return list( + cosmos_container.query_items( + query=query, + parameters=parameters, + enable_cross_partition_query=True, + ) + ) + + +def _build_archived_scope_value(scope_value): + return f"{ARCHIVED_SCOPE_PREFIX}{scope_value}" + + +def set_document_chunk_visibility(document_item, active=True): + document_id = document_item.get("id") + group_id = document_item.get("group_id") + public_workspace_id = document_item.get("public_workspace_id") + user_id = document_item.get("user_id") + is_group = group_id is not None + is_public_workspace = public_workspace_id is not None + + if not document_id: + return 0 + + search_client = _get_search_client(group_id=group_id, public_workspace_id=public_workspace_id) + chunk_results = list( + search_client.search( + search_text="*", + filter=f"document_id eq '{document_id}'", + ) + ) + + if not chunk_results: + return 0 + + documents_to_update = [] + for chunk_item in chunk_results: + if is_public_workspace: + chunk_item["public_workspace_id"] = public_workspace_id if active else _build_archived_scope_value(public_workspace_id) + elif is_group: + chunk_item["group_id"] = group_id if active else _build_archived_scope_value(group_id) + chunk_item["shared_group_ids"] = document_item.get("shared_group_ids", []) if active else [] + else: + chunk_item["user_id"] = user_id if active else _build_archived_scope_value(user_id) + chunk_item["shared_user_ids"] = document_item.get("shared_user_ids", []) if active else [] + + documents_to_update.append(chunk_item) + + search_client.upload_documents(documents=documents_to_update) + return len(documents_to_update) + + +def 
normalize_document_revision_families(user_id, group_id=None, public_workspace_id=None, document_items=None): + documents = document_items if document_items is not None else _query_accessible_documents( + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + families = {} + changes_made = False + + for document_item in documents: + family_key = _get_document_family_key(document_item) + families.setdefault(family_key, []).append(document_item) + + for family_documents in families.values(): + if len(family_documents) <= 1: + continue + + current_document = _choose_current_document(family_documents) + revision_family_id = current_document.get("revision_family_id") or current_document.get("id") + + for document_item in family_documents: + expected_current = document_item.get("id") == current_document.get("id") + update_occurred = False + + if document_item.get("revision_family_id") != revision_family_id: + document_item["revision_family_id"] = revision_family_id + update_occurred = True + + if document_item.get("is_current_version") != expected_current: + document_item["is_current_version"] = expected_current + update_occurred = True + + if expected_current: + if document_item.get("search_visibility_state") == "archived": + set_document_chunk_visibility(document_item, active=True) + document_item["search_visibility_state"] = "active" + update_occurred = True + elif document_item.get("search_visibility_state") != "active": + document_item["search_visibility_state"] = "active" + update_occurred = True + else: + if document_item.get("search_visibility_state") != "archived": + set_document_chunk_visibility(document_item, active=False) + document_item["search_visibility_state"] = "archived" + update_occurred = True + + if update_occurred: + cosmos_container.upsert_item(document_item) + changes_made = True + + return changes_made + + +def 
_get_document_family_items_from_document(document_item, user_id, group_id=None, public_workspace_id=None): + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + file_name = document_item.get("file_name") + + if public_workspace_id is not None: + query = """ + SELECT * + FROM c + WHERE c.file_name = @file_name + AND c.public_workspace_id = @public_workspace_id + """ + parameters = [ + {"name": "@file_name", "value": file_name}, + {"name": "@public_workspace_id", "value": public_workspace_id}, + ] + elif group_id is not None: + owner_group_id = document_item.get("group_id") or group_id + query = """ + SELECT * + FROM c + WHERE c.file_name = @file_name + AND c.group_id = @group_id + """ + parameters = [ + {"name": "@file_name", "value": file_name}, + {"name": "@group_id", "value": owner_group_id}, + ] + else: + owner_user_id = document_item.get("user_id") or user_id + query = """ + SELECT * + FROM c + WHERE c.file_name = @file_name + AND c.user_id = @owner_user_id + """ + parameters = [ + {"name": "@file_name", "value": file_name}, + {"name": "@owner_user_id", "value": owner_user_id}, + ] + + return list( + cosmos_container.query_items( + query=query, + parameters=parameters, + enable_cross_partition_query=True, + ) + ) + + +def _build_carried_forward_metadata(document_item, is_group=False): + carried_forward = { + "title": document_item.get("title"), + "abstract": document_item.get("abstract"), + "keywords": document_item.get("keywords"), + "publication_date": document_item.get("publication_date"), + "authors": ensure_list(document_item.get("authors")), + "document_classification": document_item.get("document_classification", "None"), + "tags": document_item.get("tags", []), + } + + if is_group: + carried_forward["shared_group_ids"] = document_item.get("shared_group_ids", []) + else: + carried_forward["shared_user_ids"] = document_item.get("shared_user_ids", []) + + return carried_forward def 
create_document(file_name, user_id, document_id, num_file_chunks, status, group_id=None, public_workspace_id=None): current_time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') @@ -64,14 +598,56 @@ def create_document(file_name, user_id, document_id, num_file_chunks, status, gr ] try: - existing_document = list( + existing_documents = list( cosmos_container.query_items( query=query, parameters=parameters, enable_cross_partition_query=True ) ) - version = existing_document[0]['version'] + 1 if existing_document else 1 + existing_documents = sorted(existing_documents, key=_document_revision_sort_key, reverse=True) + + latest_existing_document = existing_documents[0] if existing_documents else None + revision_family_id = latest_existing_document.get('revision_family_id') if latest_existing_document else None + revision_family_id = revision_family_id or (latest_existing_document.get('id') if latest_existing_document else document_id) + version = (_safe_int(latest_existing_document.get('version')) + 1) if latest_existing_document else 1 + + if latest_existing_document: + carried_forward = _build_carried_forward_metadata( + latest_existing_document, + is_group=is_group, + ) + else: + carried_forward = { + 'title': None, + 'abstract': None, + 'keywords': None, + 'publication_date': None, + 'authors': [], + 'document_classification': 'None', + 'tags': [], + 'shared_group_ids': [] if is_group else None, + 'shared_user_ids': [] if not is_group else None, + } + + for existing_document in existing_documents: + update_existing_document = False + + if existing_document.get('revision_family_id') != revision_family_id: + existing_document['revision_family_id'] = revision_family_id + update_existing_document = True + + if existing_document.get('is_current_version') is not False: + existing_document['is_current_version'] = False + update_existing_document = True + + if existing_document.get('search_visibility_state') != 'archived': + 
set_document_chunk_visibility(existing_document, active=False) + existing_document['search_visibility_state'] = 'archived' + update_existing_document = True + + if update_existing_document: + cosmos_container.upsert_item(existing_document) if is_public_workspace: document_metadata = { @@ -84,13 +660,25 @@ def create_document(file_name, user_id, document_id, num_file_chunks, status, gr "upload_date": current_time, "last_updated": current_time, "version": version, + "revision_family_id": revision_family_id, + "is_current_version": True, + "search_visibility_state": "active", "status": status, "percentage_complete": 0, - "document_classification": "None", + "document_classification": carried_forward.get("document_classification", "None"), "type": "document_metadata", "public_workspace_id": public_workspace_id, "user_id": user_id, - "tags": [] + "blob_container": _get_blob_container_name(public_workspace_id=public_workspace_id), + "blob_path": None, + "archived_blob_path": None, + "blob_path_mode": None, + "title": carried_forward.get("title"), + "abstract": carried_forward.get("abstract"), + "keywords": carried_forward.get("keywords"), + "publication_date": carried_forward.get("publication_date"), + "authors": ensure_list(carried_forward.get("authors")), + "tags": carried_forward.get("tags", []) } elif is_group: document_metadata = { @@ -103,13 +691,25 @@ def create_document(file_name, user_id, document_id, num_file_chunks, status, gr "upload_date": current_time, "last_updated": current_time, "version": version, + "revision_family_id": revision_family_id, + "is_current_version": True, + "search_visibility_state": "active", "status": status, "percentage_complete": 0, - "document_classification": "None", + "document_classification": carried_forward.get("document_classification", "None"), "type": "document_metadata", "group_id": group_id, - "shared_group_ids": [], - "tags": [] + "blob_container": _get_blob_container_name(group_id=group_id), + "blob_path": None, + 
"archived_blob_path": None, + "blob_path_mode": None, + "shared_group_ids": carried_forward.get("shared_group_ids", []), + "title": carried_forward.get("title"), + "abstract": carried_forward.get("abstract"), + "keywords": carried_forward.get("keywords"), + "publication_date": carried_forward.get("publication_date"), + "authors": ensure_list(carried_forward.get("authors")), + "tags": carried_forward.get("tags", []) } else: document_metadata = { @@ -122,15 +722,27 @@ def create_document(file_name, user_id, document_id, num_file_chunks, status, gr "upload_date": current_time, "last_updated": current_time, "version": version, + "revision_family_id": revision_family_id, + "is_current_version": True, + "search_visibility_state": "active", "status": status, "percentage_complete": 0, - "document_classification": "None", + "document_classification": carried_forward.get("document_classification", "None"), "type": "document_metadata", "user_id": user_id, - "shared_user_ids": [], + "blob_container": _get_blob_container_name(), + "blob_path": None, + "archived_blob_path": None, + "blob_path_mode": None, + "shared_user_ids": carried_forward.get("shared_user_ids", []), "embedding_tokens": 0, "embedding_model_deployment_name": None, - "tags": [] + "title": carried_forward.get("title"), + "abstract": carried_forward.get("abstract"), + "keywords": carried_forward.get("keywords"), + "publication_date": carried_forward.get("publication_date"), + "authors": ensure_list(carried_forward.get("authors")), + "tags": carried_forward.get("tags", []) } cosmos_container.upsert_item(document_metadata) @@ -1379,7 +1991,7 @@ def update_document(**kwargs): chunk_updates['title'] = existing_document.get('title') if 'authors' in updated_fields_requiring_chunk_sync: # Ensure authors is a list for the chunk metadata if needed - chunk_updates['author'] = existing_document.get('authors') + chunk_updates['author'] = ensure_list(existing_document.get('authors')) if 'file_name' in 
updated_fields_requiring_chunk_sync: chunk_updates['file_name'] = existing_document.get('file_name') if 'document_classification' in updated_fields_requiring_chunk_sync: @@ -1517,8 +2129,9 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, chunk_id = f"{document_id}_{page_number}" chunk_keywords = [] chunk_summary = "" - author = [] - title = "" + author = ensure_list(metadata.get('authors')) if metadata else [] + title = metadata.get('title', '') if metadata else '' + document_classification = metadata.get('document_classification', 'None') if metadata else 'None' # Check if this document has vision analysis and append it to chunk_text vision_analysis = metadata.get('vision_analysis') @@ -1567,7 +2180,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "page_number": page_number, "author": author, "title": title, - "document_classification": "None", + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, # or you can keep an incremental idx "upload_date": current_time, @@ -1589,7 +2202,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "page_number": page_number, "author": author, "title": title, - "document_classification": "None", + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, # or you can keep an incremental idx "upload_date": current_time, @@ -1613,7 +2226,7 @@ def save_chunks(page_text_content, page_number, file_name, user_id, document_id, "page_number": page_number, "author": author, "title": title, - "document_classification": "None", + "document_classification": document_classification, "document_tags": metadata.get('tags', []), "chunk_sequence": page_number, # or you can keep an incremental idx "upload_date": current_time, @@ -2023,7 +2636,10 @@ def update_chunk_metadata(chunk_id, user_id, group_id=None, 
public_workspace_id= for field in updatable_fields: if field in kwargs: - chunk_item[field] = kwargs[field] + if field == 'author': + chunk_item[field] = ensure_list(kwargs[field]) + else: + chunk_item[field] = kwargs[field] search_client.upload_documents(documents=[chunk_item]) @@ -2082,62 +2698,14 @@ def chunk_pdf(input_pdf_path: str, max_pages: int = 500) -> list: return chunks def get_documents(user_id, group_id=None, public_workspace_id=None): - is_group = group_id is not None - is_public_workspace = public_workspace_id is not None - - # Choose the correct cosmos_container and query parameters - if is_public_workspace: - cosmos_container = cosmos_public_documents_container - elif is_group: - cosmos_container = cosmos_group_documents_container - else: - cosmos_container = cosmos_user_documents_container - - if is_public_workspace: - query = """ - SELECT TOP 1 * - FROM c - WHERE c.public_workspace_id = @public_workspace_id - """ - parameters = [ - {"name": "@public_workspace_id", "value": public_workspace_id} - ] - elif is_group: - query = """ - SELECT * - FROM c - WHERE c.group_id = @group_id OR ARRAY_CONTAINS(c.shared_group_ids, @group_id) - """ - parameters = [ - {"name": "@group_id", "value": group_id} - ] - else: - query = """ - SELECT * - FROM c - WHERE c.user_id = @user_id OR ARRAY_CONTAINS(c.shared_user_ids, @user_id) - """ - parameters = [ - {"name": "@user_id", "value": user_id} - ] - try: - documents = list( - cosmos_container.query_items( - query=query, - parameters=parameters, - enable_cross_partition_query=True - ) + documents = _query_accessible_documents( + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, ) - - latest_documents = {} - - for doc in documents: - file_name = doc['file_name'] - if file_name not in latest_documents or doc['version'] > latest_documents[file_name]['version']: - latest_documents[file_name] = doc - - return jsonify({"documents": list(latest_documents.values())}), 200 + current_documents = 
sort_documents(select_current_documents(documents)) + return jsonify({"documents": current_documents}), 200 except Exception as e: return jsonify({'error': f'Error retrieving documents: {str(e)}'}), 500 @@ -2209,72 +2777,23 @@ def get_document(user_id, document_id, group_id=None, public_workspace_id=None): return jsonify(document_results[0]), 200 - except Exception as e: - return jsonify({'error': f'Error retrieving document: {str(e)}'}), 500 - -def get_latest_version(document_id, user_id, group_id=None, public_workspace_id=None): - is_group = group_id is not None - is_public_workspace = public_workspace_id is not None - - # Choose the correct cosmos_container and query parameters - if is_public_workspace: - cosmos_container = cosmos_public_documents_container - elif is_group: - cosmos_container = cosmos_group_documents_container - else: - cosmos_container = cosmos_user_documents_container - - if is_public_workspace: - query = """ - SELECT TOP 1 * - FROM c - WHERE c.id = @document_id - AND c.public_workspace_id = @public_workspace_id - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", "value": document_id}, - {"name": "@public_workspace_id", "value": public_workspace_id} - ] - elif is_group: - query = """ - SELECT c.version - FROM c - WHERE c.id = @document_id - AND (c.group_id = @group_id OR ARRAY_CONTAINS(c.shared_group_ids, @group_id)) - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", "value": document_id}, - {"name": "@group_id", "value": group_id} - ] - else: - query = """ - SELECT c.version - FROM c - WHERE c.id = @document_id - AND (c.user_id = @user_id OR ARRAY_CONTAINS(c.shared_user_ids, @user_id)) - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", "value": document_id}, - {"name": "@user_id", "value": user_id} - ] - - try: - results = list( - cosmos_container.query_items( - query=query, - parameters=parameters, - enable_cross_partition_query=True - ) - ) - - if results: - return 
results[0]['version'] - else: - return None + except Exception as e: + return jsonify({'error': f'Error retrieving document: {str(e)}'}), 500 +def get_latest_version(document_id, user_id, group_id=None, public_workspace_id=None): + try: + target_document = _get_documents_container( + group_id=group_id, + public_workspace_id=public_workspace_id, + ).read_item(item=document_id, partition_key=document_id) + family_documents = _get_document_family_items_from_document( + target_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + current_document = _choose_current_document(family_documents) + return current_document.get('version') if current_document else None except Exception as e: return None @@ -2349,51 +2868,37 @@ def get_document_version(user_id, document_id, version, group_id=None, public_wo except Exception as e: return jsonify({'error': f'Error retrieving document version: {str(e)}'}), 500 -def delete_from_blob_storage(document_id, user_id, file_name, group_id=None, public_workspace_id=None): +def delete_from_blob_storage(document_item, user_id=None, group_id=None, public_workspace_id=None): """Delete a document from Azure Blob Storage.""" - is_group = group_id is not None - is_public_workspace = public_workspace_id is not None - - if is_public_workspace: - storage_account_container_name = storage_account_public_documents_container_name - elif is_group: - storage_account_container_name = storage_account_group_documents_container_name - else: - storage_account_container_name = storage_account_user_documents_container_name - + # Check if enhanced citations are enabled and blob client is available settings = get_settings() enable_enhanced_citations = settings.get("enable_enhanced_citations", False) - + if not enable_enhanced_citations: return # No need to proceed if enhanced citations are disabled - + try: - # Construct the blob path using the same format as in upload_to_blob - blob_path = f"{group_id}/{file_name}" if 
is_group else f"{user_id}/{file_name}" - - # Get the blob client blob_service_client = CLIENTS.get("storage_account_office_docs_client") if not blob_service_client: - print(f"Warning: Enhanced citations enabled but blob service client not configured.") - return - - # Get container client - container_client = blob_service_client.get_container_client(storage_account_container_name) - if not container_client: - print(f"Warning: Could not get container client for {storage_account_container_name}") + print("Warning: Enhanced citations enabled but blob service client not configured.") return - - # Get blob client - blob_client = container_client.get_blob_client(blob_path) - - # Delete the blob if it exists - if blob_client.exists(): - blob_client.delete_blob() - print(f"Successfully deleted blob at {blob_path}") - else: - print(f"No blob found at {blob_path} to delete") - + + delete_targets = get_document_blob_delete_targets( + document_item, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + + for container_name, blob_path in delete_targets: + blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path) + if blob_client.exists(): + blob_client.delete_blob() + print(f"Successfully deleted blob at {container_name}/{blob_path}") + else: + print(f"No blob found at {container_name}/{blob_path} to delete") + except Exception as e: print(f"Error deleting document from blob storage: {str(e)}") # Don't raise the exception, as we want the Cosmos DB deletion to proceed @@ -2466,13 +2971,14 @@ def delete_document(user_id, document_id, group_id=None, public_workspace_id=Non if document_item.get('user_id') != user_id: raise Exception("Unauthorized access to document - only document owner can delete") - # Get the file name from the document to use for blob deletion - file_name = document_item.get('file_name') - # Delete from blob storage try: - if file_name: - delete_from_blob_storage(document_id, user_id, file_name, 
group_id, public_workspace_id) + delete_from_blob_storage( + document_item, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) except Exception as blob_error: # Log the error but continue with Cosmos DB deletion print(f"Error deleting from blob storage (continuing with document deletion): {str(blob_error)}") @@ -2488,6 +2994,81 @@ def delete_document(user_id, document_id, group_id=None, public_workspace_id=Non except Exception as e: raise + +def delete_document_revision(user_id, document_id, delete_mode="all_versions", group_id=None, public_workspace_id=None): + if delete_mode not in {"all_versions", "current_only"}: + raise ValueError("Unsupported delete mode") + + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + target_document = cosmos_container.read_item(item=document_id, partition_key=document_id) + + family_documents = _get_document_family_items_from_document( + target_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + current_document = _choose_current_document(family_documents) + target_is_current = current_document and current_document.get('id') == document_id + + if delete_mode == "all_versions": + deleted_document_ids = [] + for family_document in family_documents: + delete_document( + user_id=user_id, + document_id=family_document['id'], + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + delete_document_chunks( + document_id=family_document['id'], + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + deleted_document_ids.append(family_document['id']) + + return { + 'deleted_mode': 'all_versions', + 'deleted_document_ids': deleted_document_ids, + 'promoted_document_id': None, + } + + delete_document( + user_id=user_id, + document_id=document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + delete_document_chunks( + document_id=document_id, + 
group_id=group_id, + public_workspace_id=public_workspace_id, + ) + + promoted_document_id = None + if target_is_current: + remaining_documents = [doc for doc in family_documents if doc.get('id') != document_id] + if remaining_documents: + promoted_document = _choose_current_document(remaining_documents) + promoted_document['revision_family_id'] = target_document.get('revision_family_id') or promoted_document.get('revision_family_id') or promoted_document.get('id') + promoted_document['is_current_version'] = True + promoted_document['search_visibility_state'] = 'active' + _promote_document_blob_to_current_alias( + promoted_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + set_document_chunk_visibility(promoted_document, active=True) + cosmos_container.upsert_item(promoted_document) + promoted_document_id = promoted_document.get('id') + + return { + 'deleted_mode': 'current_only', + 'deleted_document_ids': [document_id], + 'promoted_document_id': promoted_document_id, + } + def delete_document_chunks(document_id, group_id=None, public_workspace_id=None): """Delete document chunks from Azure Cognitive Search index.""" @@ -2533,66 +3114,26 @@ def delete_document_version_chunks(document_id, version, group_id=None, public_w ) def get_document_versions(user_id, document_id, group_id=None, public_workspace_id=None): - """ Get all versions of a document for a user.""" - is_group = group_id is not None - is_public_workspace = public_workspace_id is not None - - if is_public_workspace: - cosmos_container = cosmos_public_documents_container - elif is_group: - cosmos_container = cosmos_group_documents_container - else: - cosmos_container = cosmos_user_documents_container - - if is_public_workspace: - query = """ - SELECT c.id, c.file_name, c.version, c.upload_date - FROM c - WHERE c.id = @document_id - AND c.public_workspace_id = @public_workspace_id - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", 
"value": document_id}, - {"name": "@public_workspace_id", "value": public_workspace_id} - ] - elif is_group: - query = """ - SELECT c.id, c.file_name, c.version, c.upload_date - FROM c - WHERE c.id = @document_id - AND (c.group_id = @group_id OR ARRAY_CONTAINS(c.shared_group_ids, @group_id)) - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", "value": document_id}, - {"name": "@group_id", "value": group_id} - ] - else: - query = """ - SELECT c.id, c.file_name, c.version, c.upload_date - FROM c - WHERE c.id = @document_id - AND (c.user_id = @user_id OR ARRAY_CONTAINS(c.shared_user_ids, @user_id)) - ORDER BY c.version DESC - """ - parameters = [ - {"name": "@document_id", "value": document_id}, - {"name": "@user_id", "value": user_id} - ] - try: - versions_results = list( - cosmos_container.query_items( - query=query, - parameters=parameters, - enable_cross_partition_query=True - ) + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + target_document = cosmos_container.read_item(item=document_id, partition_key=document_id) + family_documents = _get_document_family_items_from_document( + target_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, ) - - if not versions_results: - return [] - return versions_results + sorted_family = sorted(family_documents, key=_document_revision_sort_key, reverse=True) + return [ + { + 'id': doc.get('id'), + 'file_name': doc.get('file_name'), + 'version': doc.get('version'), + 'upload_date': doc.get('upload_date'), + 'is_current_version': doc.get('id') == _choose_current_document(family_documents).get('id'), + } + for doc in sorted_family + ] except Exception as e: return [] @@ -2697,7 +3238,7 @@ def process_metadata_extraction_background(document_id, user_id, group_id=None, "document_id": document_id, "user_id": user_id, "title": metadata.get('title'), - "authors": metadata.get('authors'), + "authors": 
ensure_list(metadata.get('authors')), "abstract": metadata.get('abstract'), "keywords": metadata.get('keywords'), "publication_date": metadata.get('publication_date'), @@ -3221,23 +3762,34 @@ def clean_json_codeFence(response_content: str) -> str: def ensure_list(value, delimiters=r"[;,]"): """ - Ensures the provided value is returned as a list of strings. - - If `value` is already a list, it is returned as-is. - - If `value` is a string, it is split on the given delimiters - (default: commas and semicolons). - - Otherwise, return an empty list. + Ensures the provided value is returned as a list of non-empty strings. + - If `value` is a list/tuple/set, items are normalized one by one. + - If `value` is a string, it is split on the given delimiters. + - If `value` is any other scalar, it is coerced to a single string item. + - Null and blank items are removed. """ - if isinstance(value, list): - return value - elif isinstance(value, str): - # Split on the given delimiters (commas, semicolons, etc.) - items = re.split(delimiters, value) - # Strip whitespace and remove empty strings - items = [item.strip() for item in items if item.strip()] - return items - else: + if value is None: return [] + if isinstance(value, str): + raw_items = re.split(delimiters, value) + elif isinstance(value, (list, tuple, set)): + raw_items = list(value) + else: + raw_items = [value] + + items = [] + for raw_item in raw_items: + if raw_item is None: + continue + + normalized_item = raw_item if isinstance(raw_item, str) else str(raw_item) + normalized_item = normalized_item.strip() + if normalized_item: + items.append(normalized_item) + + return items + def is_effectively_empty(value): """ Returns True if the value is 'worthless' or empty. 
@@ -3548,27 +4100,42 @@ def analyze_image_with_vision_model(image_path, user_id, document_id, settings): def upload_to_blob(temp_file_path, user_id, document_id, blob_filename, update_callback, group_id=None, public_workspace_id=None): """Uploads the file to Azure Blob Storage.""" - is_group = group_id is not None - is_public_workspace = public_workspace_id is not None - - if is_public_workspace: - storage_account_container_name = storage_account_public_documents_container_name - elif is_group: - storage_account_container_name = storage_account_group_documents_container_name - else: - storage_account_container_name = storage_account_user_documents_container_name - try: - if is_public_workspace: - blob_path = f"{public_workspace_id}/{blob_filename}" - elif is_group: - blob_path = f"{group_id}/{blob_filename}" - else: - blob_path = f"{user_id}/{blob_filename}" + cosmos_container = _get_documents_container(group_id=group_id, public_workspace_id=public_workspace_id) + current_document = cosmos_container.read_item(item=document_id, partition_key=document_id) + storage_account_container_name = current_document.get("blob_container") or _get_blob_container_name( + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + blob_path = build_current_blob_path( + blob_filename, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) - blob_service_client = CLIENTS.get("storage_account_office_docs_client") - if not blob_service_client: - raise Exception("Blob service client not available or not configured.") + previous_family_documents = [ + family_document + for family_document in _get_document_family_items_from_document( + current_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + if family_document.get("id") != document_id + ] + previous_document = max(previous_family_documents, key=_document_revision_sort_key) if previous_family_documents else None + if previous_document: + 
archived_blob_path = _archive_previous_document_blob( + previous_document, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + if archived_blob_path: + cosmos_container.upsert_item(previous_document) + + blob_service_client = _get_blob_service_client() blob_client = blob_service_client.get_blob_client( container=storage_account_container_name, @@ -3577,8 +4144,8 @@ def upload_to_blob(temp_file_path, user_id, document_id, blob_filename, update_c metadata = { "document_id": str(document_id), - "group_id": str(group_id) if is_group else None, - "user_id": str(user_id) if not is_group else None + "group_id": str(group_id) if group_id is not None else None, + "user_id": str(user_id) if group_id is None else None } metadata = {k: v for k, v in metadata.items() if v is not None} @@ -3588,6 +4155,13 @@ def upload_to_blob(temp_file_path, user_id, document_id, blob_filename, update_c with open(temp_file_path, "rb") as f: blob_client.upload_blob(f, overwrite=True, metadata=metadata) + current_document["blob_container"] = storage_account_container_name + current_document["blob_path"] = blob_path + current_document["blob_path_mode"] = CURRENT_ALIAS_BLOB_PATH_MODE + if current_document.get("archived_blob_path") is None: + current_document["archived_blob_path"] = None + cosmos_container.upsert_item(current_document) + print(f"Successfully uploaded {blob_filename} to blob storage at {blob_path}") return blob_path @@ -4809,6 +5383,173 @@ def process_json(document_id, user_id, temp_file_path, original_filename, enable # Return the count of chunks actually saved return total_chunks_saved, total_embedding_tokens, embedding_model_name +TABULAR_SCHEMA_SUMMARY_MAX_SHEETS = 8 +TABULAR_SCHEMA_SUMMARY_MAX_COLUMNS = 12 +TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS = 3 +TABULAR_SCHEMA_SUMMARY_MAX_CELL_CHARS = 60 + + +def _compact_tabular_schema_value(value, max_chars=TABULAR_SCHEMA_SUMMARY_MAX_CELL_CHARS): + text = "" if value is None else str(value) + text = " 
".join(text.split()) + + if len(text) <= max_chars: + return text + + return f"{text[:max_chars - 3]}..." + + +def _compact_tabular_columns(columns, max_columns=TABULAR_SCHEMA_SUMMARY_MAX_COLUMNS): + normalized_columns = [ + _compact_tabular_schema_value(column, max_chars=80) or "(blank)" + for column in columns + ] + visible_columns = normalized_columns[:max_columns] + omitted_count = max(len(normalized_columns) - max_columns, 0) + + if omitted_count: + visible_columns.append(f"... +{omitted_count} more columns") + + return visible_columns + + +def _build_compact_tabular_preview(df_preview): + if df_preview is None or df_preview.empty: + return "[No preview rows available]" + + preview_df = df_preview.iloc[ + :TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS, + :TABULAR_SCHEMA_SUMMARY_MAX_COLUMNS, + ].copy() + preview_df.columns = [ + _compact_tabular_schema_value(column, max_chars=80) or "(blank)" + for column in preview_df.columns + ] + + for column in preview_df.columns: + preview_df[column] = preview_df[column].map( + lambda value: _compact_tabular_schema_value(value) + ) + + preview_text = preview_df.to_string(index=False) + omitted_column_count = max(len(df_preview.columns) - TABULAR_SCHEMA_SUMMARY_MAX_COLUMNS, 0) + if omitted_column_count: + preview_text += ( + f"\n[Preview truncated to the first {TABULAR_SCHEMA_SUMMARY_MAX_COLUMNS} columns; " + f"{omitted_column_count} additional columns omitted.]" + ) + + return preview_text + + +def _build_minimal_tabular_summary(temp_file_path, original_filename, file_ext): + plugin_note = "This file is stored in blob storage for detailed analysis via the Tabular Processing plugin." 
+ + if file_ext == '.csv': + column_summary = "Column discovery unavailable" + try: + header_df = pandas.read_csv(temp_file_path, keep_default_na=False, dtype=str, nrows=0) + compact_columns = _compact_tabular_columns(header_df.columns.tolist()) + if compact_columns: + column_summary = ", ".join(compact_columns) + except Exception: + pass + + return ( + f"Tabular data file: {original_filename}\n" + f"Columns: {column_summary}\n" + f"{plugin_note}" + ) + + if file_ext in ('.xlsx', '.xls', '.xlsm'): + sheet_summary = "Sheet discovery unavailable" + try: + engine = 'openpyxl' if file_ext in ('.xlsx', '.xlsm') else 'xlrd' + excel_file = pandas.ExcelFile(temp_file_path, engine=engine) + visible_sheets = [ + _compact_tabular_schema_value(sheet_name, max_chars=80) + for sheet_name in excel_file.sheet_names[:TABULAR_SCHEMA_SUMMARY_MAX_SHEETS] + ] + omitted_sheet_count = max(len(excel_file.sheet_names) - TABULAR_SCHEMA_SUMMARY_MAX_SHEETS, 0) + + if visible_sheets: + sheet_summary = ", ".join(visible_sheets) + if omitted_sheet_count: + sheet_summary += f", ... +{omitted_sheet_count} more sheets" + except Exception: + pass + + return ( + f"Tabular workbook: {original_filename}\n" + f"Sheets: {sheet_summary}\n" + f"{plugin_note}" + ) + + return ( + f"Tabular file: {original_filename}\n" + f"{plugin_note}" + ) + + +def _build_tabular_schema_summary(temp_file_path, original_filename, file_ext): + plugin_note = "This file is available for detailed analysis via the Tabular Processing plugin." 
+ + if file_ext == '.csv': + df_preview = pandas.read_csv( + temp_file_path, + keep_default_na=False, + dtype=str, + nrows=TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS, + ) + compact_columns = _compact_tabular_columns(df_preview.columns.tolist()) + preview_rows = _build_compact_tabular_preview(df_preview) + + return ( + f"Tabular data file: {original_filename}\n" + f"Columns ({len(df_preview.columns)}): {', '.join(compact_columns) if compact_columns else 'None'}\n" + f"Preview (first {min(len(df_preview), TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS)} rows):\n{preview_rows}\n\n" + f"{plugin_note}" + ) + + if file_ext in ('.xlsx', '.xls', '.xlsm'): + engine = 'openpyxl' if file_ext in ('.xlsx', '.xlsm') else 'xlrd' + excel_file = pandas.ExcelFile(temp_file_path, engine=engine) + visible_sheet_names = excel_file.sheet_names[:TABULAR_SCHEMA_SUMMARY_MAX_SHEETS] + omitted_sheet_count = max(len(excel_file.sheet_names) - TABULAR_SCHEMA_SUMMARY_MAX_SHEETS, 0) + workbook_sections = [] + + for sheet_name in visible_sheet_names: + df_preview = excel_file.parse( + sheet_name, + keep_default_na=False, + dtype=str, + nrows=TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS, + ) + compact_columns = _compact_tabular_columns(df_preview.columns.tolist()) + preview_rows = _build_compact_tabular_preview(df_preview) + workbook_sections.append( + f"Sheet: {_compact_tabular_schema_value(sheet_name, max_chars=80)}\n" + f"Columns ({len(df_preview.columns)}): {', '.join(compact_columns) if compact_columns else 'None'}\n" + f"Preview (first {min(len(df_preview), TABULAR_SCHEMA_SUMMARY_MAX_PREVIEW_ROWS)} rows):\n{preview_rows}" + ) + + sheet_summary = ", ".join( + _compact_tabular_schema_value(sheet_name, max_chars=80) + for sheet_name in visible_sheet_names + ) + if omitted_sheet_count: + sheet_summary += f", ... 
+{omitted_sheet_count} more sheets" + + return ( + f"Tabular workbook: {original_filename}\n" + f"Sheets ({len(excel_file.sheet_names)}): {sheet_summary if sheet_summary else 'None'}\n\n" + + "\n\n".join(workbook_sections) + + f"\n\n{plugin_note}" + ) + + raise ValueError(f"Unsupported tabular file type: {file_ext}") + + def process_single_tabular_sheet(df, document_id, user_id, file_name, update_callback, group_id=None, public_workspace_id=None): """Chunks a pandas DataFrame from a CSV or Excel sheet.""" is_group = group_id is not None @@ -4924,78 +5665,74 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil # When enhanced citations is on, index a single schema summary chunk # instead of row-by-row chunking. The tabular processing plugin handles analysis. if enable_enhanced_citations: - try: - if file_ext == '.csv': - df_preview = pandas.read_csv(temp_file_path, keep_default_na=False, dtype=str, nrows=5) - full_df = pandas.read_csv(temp_file_path, keep_default_na=False, dtype=str) - row_count = len(full_df) - columns = [str(column) for column in df_preview.columns] - preview_rows = df_preview.head(5).to_string(index=False) - - schema_summary = ( - f"Tabular data file: {original_filename}\n" - f"Columns ({len(columns)}): {', '.join(columns)}\n" - f"Total rows: {row_count}\n" - f"Preview (first 5 rows):\n{preview_rows}\n\n" - f"This file is available for detailed analysis via the Tabular Processing plugin." 
- ) - elif file_ext in ('.xlsx', '.xls', '.xlsm'): - engine = 'openpyxl' if file_ext in ('.xlsx', '.xlsm') else 'xlrd' - excel_file = pandas.ExcelFile(temp_file_path, engine=engine) - workbook_sections = [] - - for sheet_name in excel_file.sheet_names: - df_preview = excel_file.parse(sheet_name, keep_default_na=False, dtype=str, nrows=3) - full_df = excel_file.parse(sheet_name, keep_default_na=False, dtype=str) - columns = [str(column) for column in df_preview.columns] - preview_rows = df_preview.head(3).to_string(index=False) - workbook_sections.append( - f"Sheet: {sheet_name}\n" - f"Columns ({len(columns)}): {', '.join(columns)}\n" - f"Total rows: {len(full_df)}\n" - f"Preview (first 3 rows):\n{preview_rows}" - ) - - schema_summary = ( - f"Tabular workbook: {original_filename}\n" - f"Sheets ({len(excel_file.sheet_names)}): {', '.join(excel_file.sheet_names)}\n\n" - + "\n\n".join(workbook_sections) - + "\n\nThis workbook is available for detailed analysis via the Tabular Processing plugin." 
- ) - else: - raise ValueError(f"Unsupported tabular file type: {file_ext}") + save_args = { + "page_number": 1, + "file_name": original_filename, + "user_id": user_id, + "document_id": document_id, + } + if is_public_workspace: + save_args["public_workspace_id"] = public_workspace_id + elif is_group: + save_args["group_id"] = group_id + try: + schema_summary = _build_tabular_schema_summary( + temp_file_path, + original_filename, + file_ext, + ) update_callback(number_of_pages=1, status=f"Indexing schema summary for {original_filename}...") + except Exception as schema_error: + log_event( + f"[process_tabular] Error building bounded schema summary for {original_filename}; using compact fallback summary: {schema_error}", + level=logging.WARNING, + ) + schema_summary = _build_minimal_tabular_summary( + temp_file_path, + original_filename, + file_ext, + ) + update_callback(number_of_pages=1, status=f"Indexing compact schema summary for {original_filename}...") - save_args = { - "page_text_content": schema_summary, - "page_number": 1, - "file_name": original_filename, - "user_id": user_id, - "document_id": document_id - } - if is_public_workspace: - save_args["public_workspace_id"] = public_workspace_id - elif is_group: - save_args["group_id"] = group_id - + try: + save_args["page_text_content"] = schema_summary token_usage = save_chunks(**save_args) - total_chunks_saved = 1 - if token_usage: - total_embedding_tokens = token_usage.get('total_tokens', 0) - embedding_model_name = token_usage.get('model_deployment_name') + except Exception as schema_index_error: + minimal_summary = _build_minimal_tabular_summary( + temp_file_path, + original_filename, + file_ext, + ) - # Don't return here โ€” fall through to metadata extraction below - except Exception as e: - log_event(f"[process_tabular] Error creating schema summary, falling back to row-by-row: {e}", level=logging.WARNING) - # Fall through to existing row-by-row processing + if minimal_summary == schema_summary: + 
raise Exception( + f"Failed indexing enhanced tabular schema summary for {original_filename}: {schema_index_error}" + ) from schema_index_error + + log_event( + f"[process_tabular] Retrying compact schema summary for {original_filename} after schema summary indexing error: {schema_index_error}", + level=logging.WARNING, + ) + update_callback(number_of_pages=1, status=f"Retrying compact schema summary for {original_filename}...") - # Only do row-by-row chunking if schema-only didn't produce chunks - if total_chunks_saved == 0: + try: + save_args["page_text_content"] = minimal_summary + token_usage = save_chunks(**save_args) + except Exception as minimal_summary_error: + raise Exception( + f"Failed indexing enhanced tabular summary for {original_filename}: {minimal_summary_error}" + ) from minimal_summary_error + + total_chunks_saved = 1 + if token_usage: + total_embedding_tokens = token_usage.get('total_tokens', 0) + embedding_model_name = token_usage.get('model_deployment_name') + + # Only do row-by-row chunking when enhanced citations is disabled. 
+ if total_chunks_saved == 0 and not enable_enhanced_citations: try: if file_ext == '.csv': - # Process CSV - # Read CSV, attempt to infer header, keep data as string initially df = pandas.read_csv( temp_file_path, keep_default_na=False, @@ -5025,7 +5762,6 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil total_chunks_saved = result elif file_ext in ('.xlsx', '.xls', '.xlsm'): - # Process Excel (potentially multiple sheets) excel_file = pandas.ExcelFile( temp_file_path, engine='openpyxl' if file_ext in ('.xlsx', '.xlsm') else 'xlrd' @@ -5036,11 +5772,7 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil accumulated_total_chunks = 0 for sheet_name in sheet_names: update_callback(status=f"Processing sheet '{sheet_name}'...") - # Read specific sheet, get values (not formulas), keep data as string - # Note: pandas typically reads values, not formulas by default. df = excel_file.parse(sheet_name, keep_default_na=False, dtype=str) - - # Create effective filename for this sheet effective_filename = f"{base_name}-{sheet_name}{ext}" if len(sheet_names) > 1 else original_filename args = { @@ -5066,7 +5798,7 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil else: accumulated_total_chunks += result - total_chunks_saved = accumulated_total_chunks # Total across all sheets + total_chunks_saved = accumulated_total_chunks except pandas.errors.EmptyDataError: log_event(f"[process_tabular] Warning: Tabular file or sheet is empty: {original_filename}", level=logging.WARNING) @@ -5074,7 +5806,6 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil except Exception as e: raise Exception(f"Failed processing Tabular file {original_filename}: {e}") - # Extract metadata if enabled and chunks were processed settings = get_settings() enable_extract_meta_data = settings.get('enable_extract_meta_data', False) if enable_extract_meta_data and total_chunks_saved > 0: 
@@ -5091,7 +5822,7 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil args["group_id"] = group_id document_metadata = extract_document_metadata(**args) - + if document_metadata: update_fields = {k: v for k, v in document_metadata.items() if v is not None and v != ""} if update_fields: @@ -5102,7 +5833,7 @@ def process_tabular(document_id, user_id, temp_file_path, original_filename, fil except Exception as e: print(f"Warning: Error extracting final metadata for Tabular document {document_id}: {str(e)}") update_callback(status=f"Processing complete (metadata extraction warning)") - + return total_chunks_saved, total_embedding_tokens, embedding_model_name def process_di_document(document_id, user_id, temp_file_path, original_filename, file_ext, enable_enhanced_citations, update_callback, group_id=None, public_workspace_id=None): @@ -6775,10 +7506,10 @@ def get_workspace_tags(user_id, group_id=None, public_workspace_id=None): workspace_type = 'personal' try: - # Query all documents with tags + # Query documents with enough metadata to collapse revisions to the current version. 
if is_public_workspace: query = """ - SELECT c.tags + SELECT c.id, c.file_name, c.version, c._ts, c.upload_date, c.tags, c.revision_family_id, c.is_current_version FROM c WHERE c.public_workspace_id = @partition_key AND IS_DEFINED(c.tags) @@ -6786,7 +7517,7 @@ def get_workspace_tags(user_id, group_id=None, public_workspace_id=None): """ elif is_group: query = """ - SELECT c.tags + SELECT c.id, c.file_name, c.version, c._ts, c.upload_date, c.tags, c.revision_family_id, c.is_current_version, c.group_id FROM c WHERE c.group_id = @partition_key AND IS_DEFINED(c.tags) @@ -6794,7 +7525,7 @@ def get_workspace_tags(user_id, group_id=None, public_workspace_id=None): """ else: query = """ - SELECT c.tags + SELECT c.id, c.file_name, c.version, c._ts, c.upload_date, c.tags, c.revision_family_id, c.is_current_version, c.user_id FROM c WHERE c.user_id = @partition_key AND IS_DEFINED(c.tags) @@ -6811,7 +7542,9 @@ def get_workspace_tags(user_id, group_id=None, public_workspace_id=None): ) ) - # Count tag occurrences + documents = select_current_documents(documents) + + # Count tag occurrences on current revisions only. 
tag_counts = {} for doc in documents: for tag in doc.get('tags', []): @@ -6990,22 +7723,16 @@ def propagate_tags_to_blob_metadata(document_id, tags, user_id, group_id=None, p cosmos_container = cosmos_user_documents_container doc_item = cosmos_container.read_item(document_id, partition_key=document_id) - file_name = doc_item.get('file_name') - if not file_name: - print(f"Warning: No file_name found for document {document_id}, skipping blob metadata update") + storage_account_container_name, blob_path = get_document_blob_storage_info( + doc_item, + user_id=user_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + if not blob_path: + print(f"Warning: No blob path found for document {document_id}, skipping blob metadata update") return - # Determine container and blob path - if is_public_workspace: - storage_account_container_name = storage_account_public_documents_container_name - blob_path = f"{public_workspace_id}/{file_name}" - elif is_group: - storage_account_container_name = storage_account_group_documents_container_name - blob_path = f"{group_id}/{file_name}" - else: - storage_account_container_name = storage_account_user_documents_container_name - blob_path = f"{user_id}/{file_name}" - blob_service_client = CLIENTS.get("storage_account_office_docs_client") if not blob_service_client: print(f"Warning: Blob service client not available, skipping blob metadata update") diff --git a/application/single_app/functions_search.py b/application/single_app/functions_search.py index 4ea75404..6851778f 100644 --- a/application/single_app/functions_search.py +++ b/application/single_app/functions_search.py @@ -120,6 +120,37 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, elif document_id: document_ids = [document_id] + normalization_changed = False + try: + from functions_documents import normalize_document_revision_families + + if doc_scope in ("all", "personal"): + normalization_changed = 
normalize_document_revision_families(user_id=user_id) or normalization_changed + + if doc_scope in ("all", "group") and active_group_ids: + for current_group_id in active_group_ids: + normalization_changed = normalize_document_revision_families( + user_id=user_id, + group_id=current_group_id, + ) or normalization_changed + + if doc_scope in ("all", "public"): + if doc_scope == "public" and active_public_workspace_id: + public_workspace_ids = [active_public_workspace_id] + else: + public_workspace_ids = get_user_visible_public_workspace_ids_from_settings(user_id) + + for workspace_id in public_workspace_ids: + normalization_changed = normalize_document_revision_families( + user_id=user_id, + public_workspace_id=workspace_id, + ) or normalization_changed + except Exception as normalization_error: + debug_print( + f"Revision normalization failed before search: {normalization_error}", + "SEARCH", + ) + # Build document ID filter clause doc_id_filter = None if document_ids and len(document_ids) > 0: @@ -144,13 +175,15 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, ) # Check cache first (pass scope parameters for correct partition key) - cached_results = get_cached_search_results( - cache_key, - user_id, - doc_scope, - active_group_ids=active_group_ids, - active_public_workspace_id=active_public_workspace_id - ) + cached_results = None + if not normalization_changed: + cached_results = get_cached_search_results( + cache_key, + user_id, + doc_scope, + active_group_ids=active_group_ids, + active_public_workspace_id=active_public_workspace_id + ) if cached_results is not None: debug_print( "Returning CACHED search results", @@ -361,38 +394,44 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, elif doc_scope == "personal": if doc_id_filter: + user_base_filter = ( + ( + f"(user_id eq '{user_id}' or shared_user_ids/any(u: u eq '{user_id},approved')) " + if enable_file_sharing else + f"user_id eq '{user_id}' 
" + ) + + f"and {doc_id_filter}" + ) + user_filter = f"{user_base_filter} and {tags_filter_clause}" if tags_filter_clause else user_base_filter + user_results = search_client_user.search( search_text=query, vector_queries=[vector_query], - filter=( - ( - f"(user_id eq '{user_id}' or shared_user_ids/any(u: u eq '{user_id},approved')) " - if enable_file_sharing else - f"user_id eq '{user_id}' " - ) + - f"and {doc_id_filter}" - ), + filter=user_filter, query_type="semantic", semantic_configuration_name="nexus-user-index-semantic-configuration", query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] + select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(user_results, top_n) else: + user_base_filter = ( + f"(user_id eq '{user_id}' or shared_user_ids/any(u: u eq '{user_id},approved')) " + if enable_file_sharing else + f"user_id eq '{user_id}' " + ) + user_filter = f"{user_base_filter} and {tags_filter_clause}" if tags_filter_clause else user_base_filter.strip() + user_results = search_client_user.search( search_text=query, vector_queries=[vector_query], - filter=( - f"(user_id eq '{user_id}' or shared_user_ids/any(u: u eq '{user_id},approved')) " - if enable_file_sharing else - f"user_id eq '{user_id}' " - ), + filter=user_filter, query_type="semantic", semantic_configuration_name="nexus-user-index-semantic-configuration", query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", 
"chunk_summary"] + select=["id", "chunk_text", "chunk_id", "file_name", "user_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(user_results, top_n) @@ -402,33 +441,35 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, elif doc_id_filter: group_conditions = " or ".join([f"group_id eq '{gid}'" for gid in active_group_ids]) shared_conditions = " or ".join([f"shared_group_ids/any(g: g eq '{gid},approved')" for gid in active_group_ids]) + group_base_filter = f"({group_conditions} or {shared_conditions}) and {doc_id_filter}" + group_filter = f"{group_base_filter} and {tags_filter_clause}" if tags_filter_clause else group_base_filter + group_results = search_client_group.search( search_text=query, vector_queries=[vector_query], - filter=( - f"({group_conditions} or {shared_conditions}) and {doc_id_filter}" - ), + filter=group_filter, query_type="semantic", semantic_configuration_name="nexus-group-index-semantic-configuration", query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] + select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(group_results, top_n) else: group_conditions = " or ".join([f"group_id eq '{gid}'" for gid in active_group_ids]) shared_conditions = " or ".join([f"shared_group_ids/any(g: g eq '{gid},approved')" for gid in active_group_ids]) + group_base_filter = f"({group_conditions} or {shared_conditions})" + group_filter = f"{group_base_filter} and {tags_filter_clause}" if 
tags_filter_clause else group_base_filter + group_results = search_client_group.search( search_text=query, vector_queries=[vector_query], - filter=( - f"({group_conditions} or {shared_conditions})" - ), + filter=group_filter, query_type="semantic", semantic_configuration_name="nexus-group-index-semantic-configuration", query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] + select=["id", "chunk_text", "chunk_id", "file_name", "group_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(group_results, top_n) @@ -441,11 +482,13 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, if visible_public_workspace_ids: # Use 'or' conditions instead of 'in' operator for OData compatibility workspace_conditions = " or ".join([f"public_workspace_id eq '{id}'" for id in visible_public_workspace_ids]) - public_filter = f"({workspace_conditions}) and {doc_id_filter}" + public_base_filter = f"({workspace_conditions}) and {doc_id_filter}" else: # Fallback to active_public_workspace_id if no visible workspaces - public_filter = f"public_workspace_id eq '{active_public_workspace_id}' and {doc_id_filter}" - + public_base_filter = f"public_workspace_id eq '{active_public_workspace_id}' and {doc_id_filter}" + + public_filter = f"{public_base_filter} and {tags_filter_clause}" if tags_filter_clause else public_base_filter + public_results = search_client_public.search( search_text=query, vector_queries=[vector_query], @@ -454,22 +497,24 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, semantic_configuration_name="nexus-public-index-semantic-configuration", 
query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "public_workspace_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] + select=["id", "chunk_text", "chunk_id", "file_name", "public_workspace_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(public_results, top_n) else: # Get visible public workspace IDs from user settings visible_public_workspace_ids = get_user_visible_public_workspace_ids_from_settings(user_id) - + # Create filter for visible public workspaces if visible_public_workspace_ids: # Use 'or' conditions instead of 'in' operator for OData compatibility workspace_conditions = " or ".join([f"public_workspace_id eq '{id}'" for id in visible_public_workspace_ids]) - public_filter = f"({workspace_conditions})" + public_base_filter = f"({workspace_conditions})" else: # Fallback to active_public_workspace_id if no visible workspaces - public_filter = f"public_workspace_id eq '{active_public_workspace_id}'" - + public_base_filter = f"public_workspace_id eq '{active_public_workspace_id}'" + + public_filter = f"{public_base_filter} and {tags_filter_clause}" if tags_filter_clause else public_base_filter + public_results = search_client_public.search( search_text=query, vector_queries=[vector_query], @@ -478,7 +523,7 @@ def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, semantic_configuration_name="nexus-public-index-semantic-configuration", query_caption="extractive", query_answer="extractive", - select=["id", "chunk_text", "chunk_id", "file_name", "public_workspace_id", "version", "chunk_sequence", "upload_date", "document_classification", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] + select=["id", "chunk_text", 
"chunk_id", "file_name", "public_workspace_id", "version", "chunk_sequence", "upload_date", "document_classification", "document_tags", "page_number", "author", "chunk_keywords", "title", "chunk_summary"] ) results = extract_search_results(public_results, top_n) diff --git a/application/single_app/route_backend_chats.py b/application/single_app/route_backend_chats.py index 92ff4dd3..26a71f41 100644 --- a/application/single_app/route_backend_chats.py +++ b/application/single_app/route_backend_chats.py @@ -5,6 +5,7 @@ from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase +from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.azure_chat_prompt_execution_settings import AzureChatPromptExecutionSettings from semantic_kernel_fact_memory_store import FactMemoryStore from semantic_kernel_loader import initialize_semantic_kernel from semantic_kernel_plugins.plugin_invocation_thoughts import ( @@ -45,7 +46,9 @@ from functions_keyvault import SecretReturnType, keyvault_model_endpoint_get_helper from functions_message_artifacts import ( build_agent_citation_artifact_documents, + build_message_artifact_payload_map, filter_assistant_artifact_items, + hydrate_agent_citations_from_artifacts, ) from functions_thoughts import ThoughtTracker @@ -192,8 +195,33 @@ def is_tabular_entity_lookup_question(user_question): 'installment', 'related', ) + explanatory_keywords = ( + 'because', + 'detail', + 'details', + 'explain', + 'reason', + 'summary', + 'why', + ) if any(phrase in normalized_question for phrase in direct_phrases) and any( - keyword in normalized_question for keyword in relationship_keywords + keyword in normalized_question for keyword in relationship_keywords + explanatory_keywords + ): + return True + + identifier_like_reference = bool(re.search( + 
r'\b(?:ret|tp|case|account|acct|payment|pay|notice|audit|w2|1099)[-_]?[a-z0-9]*\d{2,}[a-z0-9_-]*\b', + normalized_question, + )) + anchored_entity_reference = any( + re.search(pattern, normalized_question) + for pattern in ( + r'\bfor\s+(?:return|taxpayer|case|account|payment|notice|audit)\b', + r'\b(?:return|taxpayer|case|account|payment|notice|audit)\s+[`"\']?[a-z0-9_-]*\d{2,}[a-z0-9_-]*[`"\']?\b', + ) + ) + if anchored_entity_reference and identifier_like_reference and any( + keyword in normalized_question for keyword in relationship_keywords + explanatory_keywords ): return True @@ -204,6 +232,44 @@ def is_tabular_entity_lookup_question(user_question): return any(re.search(pattern, normalized_question) for pattern in entity_lookup_patterns) +def is_tabular_distinct_value_question(user_question): + """Return True for unique-value questions that should start with get_distinct_values.""" + normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) + if not normalized_question or is_tabular_schema_summary_question(normalized_question): + return False + + distinct_keywords = ( + 'different', + 'discrete', + 'distinct', + 'unique', + ) + count_keywords = ( + 'count', + 'counts', + 'how many', + 'number of', + ) + target_keywords = ( + 'link', + 'links', + 'location', + 'locations', + 'sharepoint', + 'site', + 'sites', + 'url', + 'urls', + 'value', + 'values', + ) + + has_distinct_intent = any(keyword in normalized_question for keyword in distinct_keywords) + has_count_intent = any(keyword in normalized_question for keyword in count_keywords) + has_target = any(keyword in normalized_question for keyword in target_keywords) + return (has_distinct_intent or has_count_intent) and has_target + + def is_tabular_cross_sheet_bridge_question(user_question): """Return True for grouped analytical questions that may need multiple worksheets.""" normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) @@ -302,10 +368,209 @@ def 
build_tabular_computed_results_system_message(source_label, tabular_analysis f"{rendered_analysis}\n\n" "These are tool-backed results derived from the full underlying tabular data, not just retrieved schema excerpts. " "Treat them as authoritative for row-level facts, calculations, and numeric conclusions. " - "Do not say that you lack direct access to the data if the answer is present in these computed results." + "Do not say that you lack direct access to the data if the answer is present in these computed results. " + "If a tool summary includes a full scalar value list, you may enumerate those values directly in the final answer. " + "If a tool summary includes the full matching rows from a row or text search, use the surrounding cell context in those rows when deciding which content is relevant to the user's question." ) +MULTI_FILE_TABULAR_DISTINCT_URL_EXTRACT_PATTERN = ( + r'(?i)https?://[^\s/]+/[^\s]*?(?:sites/|sitecollection/|teams/)[^\s"\']+' +) + + +def get_multi_file_tabular_analysis_mode(user_question, execution_mode='analysis', analysis_file_contexts=None): + """Return a deterministic multi-file mode when the question should bypass SK planning.""" + normalized_execution_mode = str(execution_mode or 'analysis').strip().lower() + normalized_contexts = dedupe_tabular_file_contexts(analysis_file_contexts) + if normalized_execution_mode != 'analysis' or len(normalized_contexts) <= 1: + return None + + if is_tabular_distinct_url_question(user_question): + return 'distinct_url_union' + + return None + + +def score_tabular_distinct_url_column(column_name): + """Score likely URL-bearing column names for deterministic multi-file analysis.""" + normalized_column_name = re.sub(r'\s+', ' ', str(column_name or '').strip().lower()) + if not normalized_column_name: + return None + + exact_priority = { + 'location': 0, + 'locations': 0, + 'url': 1, + 'urls': 1, + 'link': 2, + 'links': 2, + 'site': 3, + 'sites': 3, + 'path': 4, + 'paths': 4, + 'address': 5, + 
'addresses': 5, + } + if normalized_column_name in exact_priority: + return exact_priority[normalized_column_name] + + token_priority = { + 'location': 0, + 'locations': 0, + 'url': 1, + 'urls': 1, + 'link': 2, + 'links': 2, + 'site': 3, + 'sites': 3, + 'sharepoint': 4, + 'path': 5, + 'paths': 5, + 'address': 6, + 'addresses': 6, + } + token_scores = [ + token_priority[token] + for token in re.split(r'[^a-z0-9]+', normalized_column_name) + if token and token in token_priority + ] + if not token_scores: + return None + + return min(token_scores) + 10 + + +def select_tabular_distinct_url_column(column_names): + """Return the best URL-like column from a list of schema column names.""" + best_column_name = None + best_comparison_key = None + + for candidate_column in column_names or []: + rendered_column_name = str(candidate_column or '').strip() + if not rendered_column_name: + continue + + column_score = score_tabular_distinct_url_column(rendered_column_name) + if column_score is None: + continue + + comparison_key = (column_score, rendered_column_name.casefold()) + if best_comparison_key is None or comparison_key < best_comparison_key: + best_comparison_key = comparison_key + best_column_name = rendered_column_name + + return best_column_name + + +def select_tabular_distinct_url_sheet_and_column(schema_info): + """Choose the best worksheet and column for deterministic multi-file URL extraction.""" + if not isinstance(schema_info, Mapping): + return None, None + + per_sheet_schemas = schema_info.get('per_sheet_schemas', {}) + if isinstance(per_sheet_schemas, Mapping) and per_sheet_schemas: + ranked_sheet_candidates = [] + for raw_sheet_name, raw_sheet_schema in per_sheet_schemas.items(): + if not isinstance(raw_sheet_schema, Mapping): + continue + + selected_column = select_tabular_distinct_url_column(raw_sheet_schema.get('columns', [])) + if not selected_column: + continue + + row_count = raw_sheet_schema.get('row_count', 0) + try: + normalized_row_count = 
int(row_count) + except (TypeError, ValueError): + normalized_row_count = 0 + + ranked_sheet_candidates.append(( + score_tabular_distinct_url_column(selected_column), + -normalized_row_count, + str(raw_sheet_name or '').casefold(), + str(raw_sheet_name or '').strip() or None, + selected_column, + )) + + if ranked_sheet_candidates: + _, _, _, selected_sheet_name, selected_column_name = sorted(ranked_sheet_candidates)[0] + return selected_sheet_name, selected_column_name + + return None, select_tabular_distinct_url_column(schema_info.get('columns', [])) + + +def normalize_multi_file_tabular_distinct_value(value): + """Normalize a distinct scalar so multi-file unions remain stable.""" + rendered_value = str(value or '').strip() + if not rendered_value: + return None + + return rendered_value.casefold() + + +def build_multi_file_tabular_distinct_value_analysis(successful_results, failed_results=None): + """Build a deterministic combined distinct-value payload across multiple tabular files.""" + successful_results = list(successful_results or []) + failed_results = list(failed_results or []) + if not successful_results: + return None + + combined_values_by_key = {} + per_file_results = [] + any_values_limited = False + files_with_matches = 0 + + for result_payload in successful_results: + file_values = [] + for raw_value in result_payload.get('values') or []: + rendered_value = str(raw_value or '').strip() + if not rendered_value: + continue + + file_values.append(rendered_value) + normalized_value_key = normalize_multi_file_tabular_distinct_value(rendered_value) + if normalized_value_key and normalized_value_key not in combined_values_by_key: + combined_values_by_key[normalized_value_key] = rendered_value + + distinct_count = parse_tabular_result_count(result_payload.get('distinct_count')) + returned_values = parse_tabular_result_count(result_payload.get('returned_values')) + if distinct_count is None: + distinct_count = len(file_values) + if returned_values is None: + 
returned_values = len(file_values) + + values_limited = bool(result_payload.get('values_limited', False)) + any_values_limited = any_values_limited or values_limited + if returned_values > 0: + files_with_matches += 1 + + per_file_results.append({ + 'filename': result_payload.get('filename'), + 'selected_sheet': result_payload.get('selected_sheet'), + 'column': result_payload.get('column'), + 'distinct_count': distinct_count, + 'returned_values': returned_values, + 'values_limited': values_limited, + 'values': file_values, + }) + + combined_values = sorted(combined_values_by_key.values(), key=lambda item: item.casefold()) + return json.dumps({ + 'analysis_type': 'multi_file_distinct_url_union', + 'files_requested': len(successful_results) + len(failed_results), + 'files_analyzed': len(successful_results), + 'files_with_matches': files_with_matches, + 'files_failed': len(failed_results), + 'distinct_count': len(combined_values), + 'returned_values': len(combined_values), + 'values_limited': any_values_limited, + 'values': combined_values, + 'per_file_results': per_file_results, + 'failed_files': failed_results, + }, indent=2, default=str) + + def get_kernel(): return getattr(g, 'kernel', None) or getattr(builtins, 'kernel', None) @@ -774,6 +1039,12 @@ def describe_tabular_invocation_conditions(invocation): if query_expression: return query_expression + search_value = str(parameters.get('search_value') or '').strip() + if search_value: + search_columns = str(parameters.get('search_columns') or '').strip() or 'ALL COLUMNS' + search_operator = str(parameters.get('search_operator') or 'contains').strip() + return f"search_value={search_value}; search_operator={search_operator}; search_columns={search_columns}" + column_name = str(parameters.get('column') or '').strip() operator = str(parameters.get('operator') or '').strip() value = parameters.get('value') @@ -785,6 +1056,17 @@ def describe_tabular_invocation_conditions(invocation): if lookup_column: return 
f"{lookup_column} == {lookup_value}" + extract_mode = str(parameters.get('extract_mode') or '').strip() + if extract_mode: + extraction_bits = [f"extract_mode={extract_mode}"] + extract_pattern = str(parameters.get('extract_pattern') or '').strip() + url_path_segments = parameters.get('url_path_segments') + if extract_pattern: + extraction_bits.append(f"extract_pattern={extract_pattern}") + if url_path_segments not in (None, ''): + extraction_bits.append(f"url_path_segments={url_path_segments}") + return ', '.join(extraction_bits) + return None @@ -841,7 +1123,7 @@ def get_tabular_query_overlap_summary(invocations, max_rows=10): for invocation in invocations or []: function_name = getattr(invocation, 'function_name', '') - if function_name not in {'query_tabular_data', 'filter_rows'}: + if function_name not in {'query_tabular_data', 'filter_rows', 'search_rows'}: continue rows = get_tabular_invocation_data_rows(invocation) @@ -941,6 +1223,49 @@ def get_tabular_invocation_compact_payload(invocation, max_rows=5): 'operation': compact_tabular_fallback_value(result_payload.get('operation')), 'result': compact_tabular_fallback_value(result_payload.get('result')), }) + elif function_name == 'get_distinct_values': + for key_name in ( + 'column', + 'filter_applied', + 'normalize_match', + 'extract_mode', + 'extract_pattern', + 'url_path_segments', + 'matched_cell_count', + 'extracted_match_count', + 'distinct_count', + 'returned_values', + 'values_limited', + ): + if key_name in result_payload: + compact_payload[key_name] = compact_tabular_fallback_value(result_payload.get(key_name)) + + raw_values = result_payload.get('values') + if isinstance(raw_values, list): + compact_values = [] + rendered_values_length = 0 + max_values_in_payload = 200 + max_rendered_values_chars = 14000 + + for raw_value in raw_values: + compact_value = compact_tabular_fallback_value(raw_value) + rendered_value = json.dumps(compact_value, default=str) + projected_length = rendered_values_length + 
len(rendered_value) + 2 + + if compact_values and ( + len(compact_values) >= max_values_in_payload + or projected_length > max_rendered_values_chars + ): + break + + compact_values.append(compact_value) + rendered_values_length = projected_length + + compact_payload['values'] = compact_values + compact_payload['full_values_included'] = len(compact_values) == len(raw_values) + if len(compact_values) != len(raw_values): + compact_payload['values_limited'] = True + compact_payload['returned_values'] = len(compact_values) elif function_name in {'group_by_aggregate', 'group_by_datetime_component'}: for key_name in ( 'group_by', @@ -975,18 +1300,46 @@ def get_tabular_invocation_compact_payload(invocation, max_rows=5): for row in data_rows[:max_rows] ] compact_payload['sample_rows_limited'] = len(data_rows) > max_rows - elif function_name in {'query_tabular_data', 'filter_rows'}: + elif function_name in {'query_tabular_data', 'filter_rows', 'search_rows'}: + for key_name in ('search_value', 'search_operator', 'searched_columns', 'matched_columns', 'return_columns'): + if key_name in result_payload: + compact_payload[key_name] = compact_tabular_fallback_value(result_payload.get(key_name)) + for key_name in ('total_matches', 'returned_rows'): if key_name in result_payload: compact_payload[key_name] = compact_tabular_fallback_value(result_payload.get(key_name)) data_rows = get_tabular_invocation_data_rows(invocation) if data_rows: + desired_max_rows = max_rows + total_matches = result_payload.get('total_matches') + returned_rows = result_payload.get('returned_rows') + try: + total_matches = int(total_matches) + except (TypeError, ValueError): + total_matches = None + try: + returned_rows = int(returned_rows) + except (TypeError, ValueError): + returned_rows = len(data_rows) + + if ( + total_matches is not None + and returned_rows == total_matches + and total_matches <= 25 + ): + desired_max_rows = max(desired_max_rows, total_matches) + compact_payload['sample_rows'] = [ 
compact_tabular_fallback_value(row) - for row in data_rows[:max_rows] + for row in data_rows[:desired_max_rows] ] - compact_payload['sample_rows_limited'] = len(data_rows) > max_rows + compact_payload['sample_rows_limited'] = len(data_rows) > desired_max_rows + compact_payload['full_rows_included'] = ( + total_matches is not None + and total_matches == returned_rows + and len(compact_payload['sample_rows']) == len(data_rows) + ) rendered_conditions = describe_tabular_invocation_conditions(invocation) if rendered_conditions: @@ -1061,6 +1414,15 @@ def build_tabular_analysis_fallback_from_invocations(invocations): if 'sample_rows' in shrunk_payload: shrunk_payload['sample_rows'] = shrunk_payload['sample_rows'][:2] shrunk_payload['sample_rows_limited'] = True + shrunk_payload['full_rows_included'] = False + if isinstance(shrunk_payload.get('values'), list) and len(shrunk_payload['values']) > 25: + shrunk_payload['values'] = shrunk_payload['values'][:25] + shrunk_payload['values_limited'] = True + shrunk_payload['full_values_included'] = False + shrunk_payload['returned_values'] = min( + int(shrunk_payload.get('returned_values') or len(shrunk_payload['values'])), + len(shrunk_payload['values']), + ) if isinstance(shrunk_payload.get('top_results'), dict): shrunk_payload['top_results'] = dict(list(shrunk_payload['top_results'].items())[:3]) @@ -1072,6 +1434,15 @@ def build_tabular_analysis_fallback_from_invocations(invocations): if len(candidate_text) > (max_fallback_chars - coverage_note_reserve): shrunk_payload.pop('sample_rows', None) shrunk_payload['sample_rows_limited'] = True + shrunk_payload['full_rows_included'] = False + if isinstance(shrunk_payload.get('values'), list) and len(shrunk_payload['values']) > 10: + shrunk_payload['values'] = shrunk_payload['values'][:10] + shrunk_payload['values_limited'] = True + shrunk_payload['full_values_included'] = False + shrunk_payload['returned_values'] = min( + int(shrunk_payload.get('returned_values') or 
len(shrunk_payload['values'])), + len(shrunk_payload['values']), + ) shrunk_payload['result_summary_truncated'] = True if isinstance(shrunk_payload.get('top_results'), dict): shrunk_payload['top_results'] = dict(list(shrunk_payload['top_results'].items())[:2]) @@ -1219,163 +1590,1485 @@ def summarize_tabular_invocation_errors(invocations): return unique_errors -def filter_tabular_citation_invocations(invocations): - """Hide discovery-only citation noise when analytical tabular calls exist.""" - if not invocations: - return [] - - successful_analytical_invocations, _ = split_tabular_analysis_invocations(invocations) - if successful_analytical_invocations: - return successful_analytical_invocations +def summarize_tabular_discovery_invocations(invocations, max_sheet_names=6): + """Return compact workbook-discovery summaries for retry prompts.""" + discovery_summaries = [] - successful_schema_summary_invocations = [] for invocation in invocations or []: if getattr(invocation, 'function_name', '') != 'describe_tabular_file': continue if get_tabular_invocation_error_message(invocation): continue - successful_schema_summary_invocations.append(invocation) - if successful_schema_summary_invocations: - return successful_schema_summary_invocations + result_payload = get_tabular_invocation_result_payload(invocation) or {} + filename = str(result_payload.get('filename') or '').strip() + if not filename: + continue - return [] + sheet_names = result_payload.get('sheet_names') or [] + if not isinstance(sheet_names, list): + sheet_names = [] + relationship_hints = result_payload.get('relationship_hints') or [] + if not isinstance(relationship_hints, list): + relationship_hints = [] -def format_tabular_thought_parameter_value(value): - """Render a concise parameter value for tabular thought details.""" - if value is None: - return None + summary_parts = [filename] + if result_payload.get('is_workbook'): + summary_parts.append(f"sheet_count={result_payload.get('sheet_count', 
len(sheet_names))}") + if sheet_names: + rendered_sheet_names = ', '.join(str(sheet_name) for sheet_name in sheet_names[:max_sheet_names]) + if len(sheet_names) > max_sheet_names: + rendered_sheet_names += f", +{len(sheet_names) - max_sheet_names} more" + summary_parts.append(f"sheets={rendered_sheet_names}") + if relationship_hints: + summary_parts.append(f"relationship_hints={len(relationship_hints)}") - if isinstance(value, (dict, list, tuple)): - rendered_value = json.dumps(value, default=str) - else: - rendered_value = str(value) + discovery_summaries.append('; '.join(summary_parts)) - if not rendered_value: + return discovery_summaries + + +def extract_json_object_from_text(text): + """Extract the first JSON object embedded in a model response.""" + rendered_text = str(text or '').strip() + if not rendered_text: return None - if len(rendered_value) > 120: - rendered_value = rendered_value[:117] + '...' + json_decoder = json.JSONDecoder() + for character_index, character in enumerate(rendered_text): + if character != '{': + continue - return rendered_value + try: + payload, _ = json_decoder.raw_decode(rendered_text[character_index:]) + except Exception: + continue + if isinstance(payload, dict): + return payload -def get_tabular_tool_thought_payloads(invocations): - """Convert tabular plugin invocations into user-visible thought payloads.""" - thought_payloads = [] + return None - for invocation in invocations or []: - function_name = getattr(invocation, 'function_name', 'unknown_tool') - duration_ms = getattr(invocation, 'duration_ms', None) - error_message = get_tabular_invocation_error_message(invocation) - success = getattr(invocation, 'success', True) and not error_message - parameters = getattr(invocation, 'parameters', {}) or {} - filename = parameters.get('filename') - sheet_name = parameters.get('sheet_name') - duration_suffix = f" ({int(duration_ms)}ms)" if duration_ms else "" - content = f"Tabular tool {function_name}{duration_suffix}" - if 
filename: - content = f"Tabular tool {function_name} on {filename}{duration_suffix}" - if filename and sheet_name: - content = f"Tabular tool {function_name} on {filename} [{sheet_name}]{duration_suffix}" - if not success: - content = f"{content} failed" +def normalize_tabular_reviewer_function_name(function_name): + """Normalize reviewer-selected function names to bare plugin function names.""" + normalized_function_name = str(function_name or '').strip() + if not normalized_function_name: + return '' - detail_parts = [] - for parameter_name, parameter_value in parameters.items(): - if parameter_name in get_tabular_thought_excluded_parameter_names(): - continue + normalized_function_name = normalized_function_name.replace('tabular_processing-', '') + if '.' in normalized_function_name: + normalized_function_name = normalized_function_name.split('.')[-1] - rendered_value = format_tabular_thought_parameter_value(parameter_value) - if rendered_value is None: - continue + return normalized_function_name.strip() - detail_parts.append(f"{parameter_name}={rendered_value}") - rendered_error_message = format_tabular_thought_parameter_value(error_message) - if rendered_error_message: - detail_parts.append(f"error={rendered_error_message}") +def parse_tabular_reviewer_plan(review_text): + """Parse a JSON-only LLM reviewer plan into executable call descriptors.""" + payload = extract_json_object_from_text(review_text) + if not isinstance(payload, dict): + return [] - detail_parts.append(f"success={success}") - detail = "; ".join(detail_parts) if detail_parts else None - thought_payloads.append((content, detail)) + raw_calls = payload.get('calls') + if not isinstance(raw_calls, list): + raw_call = payload.get('call') + raw_calls = [raw_call] if isinstance(raw_call, dict) else [] - return thought_payloads + normalized_calls = [] + for raw_call in raw_calls: + if not isinstance(raw_call, dict): + continue + function_name = normalize_tabular_reviewer_function_name( + 
raw_call.get('function') or raw_call.get('function_name') + ) + arguments = raw_call.get('arguments') or raw_call.get('args') or {} + if not function_name or not isinstance(arguments, dict): + continue -def get_tabular_status_thought_payloads(invocations, analysis_succeeded): - """Return additional tabular status thoughts for retries and fallbacks.""" - successful_analytical_invocations, failed_analytical_invocations = split_tabular_analysis_invocations(invocations) - if not failed_analytical_invocations: - return [] + normalized_calls.append({ + 'function_name': function_name, + 'arguments': dict(arguments), + }) - error_messages = summarize_tabular_invocation_errors(failed_analytical_invocations) - detail = "; ".join(error_messages) if error_messages else None + return normalized_calls - if analysis_succeeded and successful_analytical_invocations: - return [( - "Tabular analysis recovered after retrying tool errors", - detail, - )] - if analysis_succeeded: - return [( - "Tabular analysis recovered via internal fallback after tool errors", - detail, - )] +def get_tabular_reviewer_function_manifest(): + """Return compact analytical-function guidance for the reviewer LLM.""" + return { + 'lookup_value': { + 'best_for': 'one exact row or entity and one target column value', + 'required_arguments': ['filename', 'lookup_column', 'lookup_value', 'target_column'], + 'optional_arguments': ['match_operator', 'normalize_match', 'sheet_name', 'sheet_index', 'max_rows'], + }, + 'get_distinct_values': { + 'best_for': 'unique values, discrete counts, canonical site lists, embedded URL or regex extraction, and deterministic de-duplication after the relevant text cohort has been narrowed', + 'required_arguments': ['filename', 'column'], + 'optional_arguments': ['query_expression', 'filter_column', 'filter_operator', 'filter_value', 'additional_filter_column', 'additional_filter_operator', 'additional_filter_value', 'extract_mode', 'extract_pattern', 'url_path_segments', 
'normalize_match', 'sheet_name', 'sheet_index', 'max_values'], + }, + 'count_rows': { + 'best_for': 'deterministic how-many questions after a filter or query', + 'required_arguments': ['filename'], + 'optional_arguments': ['query_expression', 'filter_column', 'filter_operator', 'filter_value', 'additional_filter_column', 'additional_filter_operator', 'additional_filter_value', 'normalize_match', 'sheet_name', 'sheet_index'], + }, + 'search_rows': { + 'best_for': 'searching one column, several columns, or an entire sheet/workbook for a topic, phrase, path, code, or other value when the relevant column is unclear', + 'required_arguments': ['filename', 'search_value'], + 'optional_arguments': ['search_columns', 'search_operator', 'return_columns', 'query_expression', 'filter_column', 'filter_operator', 'filter_value', 'additional_filter_column', 'additional_filter_operator', 'additional_filter_value', 'normalize_match', 'sheet_name', 'sheet_index', 'max_rows'], + }, + 'filter_rows': { + 'best_for': 'searching a text column for matching cells while preserving full row context before a second analytical step', + 'required_arguments': ['filename', 'column', 'operator', 'value'], + 'optional_arguments': ['additional_filter_column', 'additional_filter_operator', 'additional_filter_value', 'normalize_match', 'sheet_name', 'sheet_index', 'max_rows'], + }, + 'query_tabular_data': { + 'best_for': 'compound boolean filters expressed with pandas DataFrame.query()', + 'required_arguments': ['filename', 'query_expression'], + 'optional_arguments': ['sheet_name', 'sheet_index', 'max_rows'], + }, + 'filter_rows_by_related_values': { + 'best_for': 'joining a cohort from one sheet to matching rows on another sheet', + 'required_arguments': ['filename', 'source_sheet_name', 'source_value_column', 'target_sheet_name', 'target_match_column'], + 'optional_arguments': ['source_query_expression', 'source_filter_column', 'source_filter_operator', 'source_filter_value', 
'target_query_expression', 'target_filter_column', 'target_filter_operator', 'target_filter_value', 'normalize_match', 'max_rows'], + }, + 'count_rows_by_related_values': { + 'best_for': 'deterministic counts for cross-sheet cohort membership or related-record questions', + 'required_arguments': ['filename', 'source_sheet_name', 'source_value_column', 'target_sheet_name', 'target_match_column'], + 'optional_arguments': ['source_query_expression', 'source_filter_column', 'source_filter_operator', 'source_filter_value', 'target_query_expression', 'target_filter_column', 'target_filter_operator', 'target_filter_value', 'normalize_match'], + }, + 'aggregate_column': { + 'best_for': 'sum, mean, min, max, median, std, count, nunique, or value_counts on one column', + 'required_arguments': ['filename', 'column', 'operation'], + 'optional_arguments': ['sheet_name', 'sheet_index'], + }, + 'group_by_aggregate': { + 'best_for': 'grouped metrics by category or entity', + 'required_arguments': ['filename', 'group_by', 'aggregate_column', 'operation'], + 'optional_arguments': ['query_expression', 'sheet_name', 'sheet_index', 'top_n'], + }, + 'group_by_datetime_component': { + 'best_for': 'time-based grouped analysis by year, quarter, month, week, day, or hour', + 'required_arguments': ['filename', 'datetime_column', 'date_component', 'aggregate_column', 'operation'], + 'optional_arguments': ['query_expression', 'sheet_name', 'sheet_index', 'top_n'], + }, + } - return [( - "Tabular analysis encountered tool errors before fallback", - detail, - )] +def resolve_tabular_reviewer_call_arguments(raw_arguments, analysis_file_contexts, + fallback_source_hint='workspace', + fallback_group_id=None, + fallback_public_workspace_id=None): + """Inject filename and source context into an LLM reviewer tool plan.""" + raw_arguments = dict(raw_arguments or {}) + normalized_contexts = analysis_file_contexts or [] + file_context_by_exact_name = { + file_context['file_name']: file_context + for 
file_context in normalized_contexts + if file_context.get('file_name') + } + file_context_by_lower_name = { + str(file_context.get('file_name') or '').strip().lower(): file_context + for file_context in normalized_contexts + if file_context.get('file_name') + } -def _normalize_tabular_sheet_token(token): - """Normalize question and sheet-name tokens for lightweight matching.""" - normalized = re.sub(r'[^a-z0-9]+', '', str(token or '').lower()) - if len(normalized) > 4 and normalized.endswith('ies'): - return normalized[:-3] + 'y' - if len(normalized) > 3 and normalized.endswith('s') and not normalized.endswith('ss'): - return normalized[:-1] - return normalized + requested_filename = str(raw_arguments.get('filename') or '').strip() + resolved_file_context = None + if requested_filename: + resolved_file_context = ( + file_context_by_exact_name.get(requested_filename) + or file_context_by_lower_name.get(requested_filename.lower()) + ) + elif len(normalized_contexts) == 1: + resolved_file_context = normalized_contexts[0] + + if not resolved_file_context: + if requested_filename: + return None, f"Reviewer selected unknown filename '{requested_filename}'." + return None, 'Reviewer did not select a filename and multiple files were available.' 
+ + normalized_arguments = dict(raw_arguments) + normalized_arguments['filename'] = resolved_file_context['file_name'] + normalized_arguments['source'] = ( + resolved_file_context.get('source_hint') + or fallback_source_hint + or normalized_arguments.get('source') + or 'workspace' + ) + resolved_group_id = resolved_file_context.get('group_id') or fallback_group_id + resolved_public_workspace_id = ( + resolved_file_context.get('public_workspace_id') + or fallback_public_workspace_id + ) + if resolved_group_id: + normalized_arguments['group_id'] = resolved_group_id + if resolved_public_workspace_id: + normalized_arguments['public_workspace_id'] = resolved_public_workspace_id -def _tokenize_tabular_sheet_text(text): - """Tokenize free text into normalized sheet-matching tokens.""" - original_text = re.sub(r'(?i)w[\s\-_]*2', ' w2 ', str(text or '')) - expanded_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', original_text) - expanded_text = re.sub(r'([A-Za-z])([0-9])', r'\1 \2', expanded_text) - expanded_text = re.sub(r'([0-9])([A-Za-z])', r'\1 \2', expanded_text) - expanded_text = re.sub(r'[_\-]+', ' ', expanded_text) - tokens = [] - seen_tokens = set() + if not str(normalized_arguments.get('sheet_name') or '').strip(): + normalized_arguments.pop('sheet_name', None) + if normalized_arguments.get('sheet_index') in ('', None): + normalized_arguments.pop('sheet_index', None) - for raw_text in (original_text, expanded_text): - for raw_token in re.split(r'[^a-z0-9]+', raw_text.lower()): - normalized_token = _normalize_tabular_sheet_token(raw_token) - if not normalized_token or len(normalized_token) <= 1: - continue - if normalized_token in seen_tokens: - continue - seen_tokens.add(normalized_token) - tokens.append(normalized_token) + return normalized_arguments, None - return tokens +def normalize_tabular_reviewer_argument_value(argument_name, argument_value): + """Normalize scalar reviewer-planned values to plugin-friendly argument types.""" + if argument_value is None: + 
return None -def _extract_tabular_entity_anchor_terms(question_text): - """Extract likely primary-entity terms from an entity lookup question.""" - normalized_question = str(question_text or '').strip().lower() + if isinstance(argument_value, bool): + return 'true' if argument_value else 'false' + + if argument_name in {'max_rows', 'max_values', 'sheet_index', 'top_n'} and isinstance(argument_value, (int, float)): + return str(int(argument_value)) + + return argument_value + + +def is_tabular_distinct_url_question(user_question): + """Return True when the user is asking for unique or counted URL/site values.""" + normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) if not normalized_question: - return [] + return False - stopwords = { - 'and', - 'any', - 'by', - 'detail', + count_keywords = ( + 'count', + 'counts', + 'how many', + 'number of', + 'different', + 'discrete', + 'distinct', + 'unique', + ) + url_keywords = ( + 'http', + 'https', + 'link', + 'links', + 'sharepoint', + 'site', + 'sites', + 'url', + 'urls', + ) + return any(keyword in normalized_question for keyword in count_keywords) and any( + keyword in normalized_question for keyword in url_keywords + ) + + +def question_requests_tabular_row_context(user_question): + """Return True when the user question implies a need for matching-row context.""" + normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) + if not normalized_question: + return False + + row_context_keywords = ( + 'appear', + 'appears', + 'appearing', + 'find', + 'found', + 'search', + 'show', + 'where', + ) + return any(keyword in normalized_question for keyword in row_context_keywords) + + +def question_requests_tabular_exhaustive_results(user_question): + """Return True when the user explicitly asks for a full list or all matching results.""" + normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) + if not normalized_question: + return False + + 
explicit_phrases = ( + 'all results', + 'all rows', + 'all values', + 'all of them', + 'complete list', + 'each one', + 'every one', + 'exhaustive', + 'full list', + 'list all', + 'list each', + 'list every', + 'list them all', + 'list them out', + 'return all', + 'show all', + 'show me all', + ) + if any(phrase in normalized_question for phrase in explicit_phrases): + return True + + return ( + 'list' in normalized_question + and any(token in normalized_question for token in (' all ', ' them', ' out', ' each ', ' every ')) + ) + + +def parse_tabular_result_count(value): + """Parse a numeric count from invocation metadata or payloads.""" + try: + parsed_value = int(value) + except (TypeError, ValueError): + return None + + return parsed_value if parsed_value >= 0 else None + + +def determine_tabular_follow_up_limit(total_available, returned_count, max_cap=200): + """Return a larger result limit when the current tool call returned only a partial slice.""" + total_count = parse_tabular_result_count(total_available) + current_count = parse_tabular_result_count(returned_count) + if total_count is None or current_count is None or total_count <= current_count: + return None + + target_count = min(total_count, max_cap) + if target_count <= current_count: + return None + + return str(target_count) + + +def extract_tabular_high_signal_search_terms(user_question, max_terms=2): + """Extract a short list of likely literal search terms from the user question.""" + question_text = str(user_question or '').strip() + if not question_text: + return [] + + normalized_question = re.sub(r'\s+', ' ', question_text) + lowercase_question = normalized_question.lower() + prioritized_terms = [] + seen_terms = set() + + def add_term(raw_term): + rendered_term = str(raw_term or '').strip() + if not rendered_term: + return + + normalized_term = rendered_term.casefold() + if normalized_term in seen_terms: + return + + seen_terms.add(normalized_term) + prioritized_terms.append(rendered_term) + + 
for quoted_term in re.findall(r'["\']([^"\']{2,80})["\']', normalized_question): + add_term(quoted_term) + + special_terms = ( + ('sharepoint', 'SharePoint'), + ('onedrive', 'OneDrive'), + ('teams', 'Teams'), + ('ccore', 'CCORe'), + ('o365', 'O365'), + ) + for token, rendered_term in special_terms: + if token in lowercase_question: + add_term(rendered_term) + + ignored_tokens = { + 'all', + 'and', + 'appear', + 'appears', + 'are', + 'cell', + 'cells', + 'column', + 'columns', + 'count', + 'counts', + 'discrete', + 'distinct', + 'document', + 'documents', + 'does', + 'every', + 'file', + 'for', + 'from', + 'get', + 'how', + 'in', + 'is', + 'it', + 'link', + 'links', + 'location', + 'locations', + 'many', + 'number', + 'of', + 'on', + 'or', + 'out', + 'please', + 'reason', + 'row', + 'rows', + 'search', + 'sheet', + 'sheets', + 'show', + 'site', + 'sites', + 'that', + 'the', + 'them', + 'these', + 'they', + 'this', + 'to', + 'topic', + 'unique', + 'url', + 'urls', + 'value', + 'values', + 'what', + 'where', + 'which', + 'word', + 'workbook', + 'list', + 'listed', + 'lists', + 'lsit', + } + + for raw_token in re.findall(r'[A-Za-z0-9][A-Za-z0-9._\-/]{2,}', normalized_question): + lowercase_token = raw_token.casefold() + if lowercase_token in ignored_tokens: + continue + add_term(raw_token) + if len(prioritized_terms) >= max_terms: + break + + return prioritized_terms[:max_terms] + + +def extract_tabular_secondary_filter_terms(user_question, primary_terms=None, max_terms=3): + """Return likely cohort/filter terms after excluding the primary topic terms.""" + excluded_terms = { + str(term or '').strip().casefold() + for term in (primary_terms or []) + if str(term or '').strip() + } + secondary_terms = [] + + for candidate_term in extract_tabular_high_signal_search_terms( + user_question, + max_terms=max_terms + len(excluded_terms) + 3, + ): + normalized_candidate_term = str(candidate_term or '').strip().casefold() + if not normalized_candidate_term or 
normalized_candidate_term in excluded_terms: + continue + + secondary_terms.append(candidate_term) + if len(secondary_terms) >= max_terms: + break + + return secondary_terms + + +def normalize_tabular_row_text(value): + """Normalize a row cell value for lightweight controller-side term matching.""" + if value is None: + return '' + + return re.sub(r'\s+', ' ', str(value).casefold()).strip() + + +def parse_tabular_column_candidates(raw_columns): + """Normalize column arguments from string or list form into a stable list.""" + if isinstance(raw_columns, list): + candidate_columns = raw_columns + elif isinstance(raw_columns, str): + candidate_columns = raw_columns.split(',') + else: + return [] + + normalized_columns = [] + seen_columns = set() + for candidate_column in candidate_columns: + normalized_column = str(candidate_column or '').strip() + if not normalized_column: + continue + + lowered_column = normalized_column.casefold() + if lowered_column in seen_columns: + continue + + seen_columns.add(lowered_column) + normalized_columns.append(normalized_column) + + return normalized_columns + + +def tabular_value_looks_url_like(value): + """Return True when a scalar cell value looks like a URL or site path.""" + rendered_value = normalize_tabular_row_text(value) + if not rendered_value: + return False + + return ( + 'http://' in rendered_value + or 'https://' in rendered_value + or 'sharepoint.com' in rendered_value + or '/sites/' in rendered_value + ) + + +def tabular_result_payload_contains_url_like_content(result_payload): + """Return True when a result payload contains URL-like strings.""" + if not isinstance(result_payload, dict): + return False + + candidate_values = [] + raw_values = result_payload.get('values') + if isinstance(raw_values, list): + candidate_values.extend(raw_values[:20]) + + raw_rows = result_payload.get('data') + if isinstance(raw_rows, list): + for raw_row in raw_rows[:10]: + if not isinstance(raw_row, dict): + continue + 
candidate_values.extend(raw_row.values()) + + for candidate_value in candidate_values: + rendered_candidate = str(candidate_value or '').strip().lower() + if not rendered_candidate: + continue + if ( + 'http://' in rendered_candidate + or 'https://' in rendered_candidate + or 'sharepoint.com' in rendered_candidate + or '/sites/' in rendered_candidate + ): + return True + + return False + + +def infer_tabular_url_value_column_from_rows(rows, preferred_columns=None): + """Infer which returned row column contains URL-like values.""" + preferred_columns = parse_tabular_column_candidates(preferred_columns) + for preferred_column in preferred_columns: + if any( + isinstance(row, dict) and tabular_value_looks_url_like(row.get(preferred_column)) + for row in (rows or []) + ): + return preferred_column + + column_scores = {} + for row in rows or []: + if not isinstance(row, dict): + continue + + for column_name, cell_value in row.items(): + normalized_column_name = str(column_name or '').strip() + if not normalized_column_name or normalized_column_name.startswith('_'): + continue + if not tabular_value_looks_url_like(cell_value): + continue + + column_scores[normalized_column_name] = column_scores.get(normalized_column_name, 0) + 1 + + if not column_scores: + return None + + return sorted( + column_scores.items(), + key=lambda item: (-item[1], item[0].casefold()), + )[0][0] + + +def infer_tabular_secondary_filter_from_rows(rows, filter_terms, excluded_columns=None): + """Infer a likely cohort column/term pair from returned row context.""" + normalized_excluded_columns = { + str(column_name or '').strip().casefold() + for column_name in (excluded_columns or []) + if str(column_name or '').strip() + } + normalized_filter_terms = [ + str(filter_term or '').strip() + for filter_term in (filter_terms or []) + if str(filter_term or '').strip() + ] + if not normalized_filter_terms: + return None + + candidate_scores = {} + for row in rows or []: + if not isinstance(row, dict): + 
continue + + for column_name, cell_value in row.items(): + normalized_column_name = str(column_name or '').strip() + if not normalized_column_name or normalized_column_name.startswith('_'): + continue + if normalized_column_name.casefold() in normalized_excluded_columns: + continue + + rendered_cell_value = normalize_tabular_row_text(cell_value) + if not rendered_cell_value: + continue + + for filter_term in normalized_filter_terms: + if str(filter_term).casefold() not in rendered_cell_value: + continue + + score_key = (normalized_column_name, filter_term) + candidate_scores[score_key] = candidate_scores.get(score_key, 0) + 1 + + if not candidate_scores: + return None + + (selected_column, selected_term), match_count = sorted( + candidate_scores.items(), + key=lambda item: (-item[1], item[0][0].casefold(), item[0][1].casefold()), + )[0] + return { + 'column': selected_column, + 'term': selected_term, + 'match_count': match_count, + } + + +def infer_tabular_url_path_segments(user_question): + """Infer URL path truncation when the user is asking about site roots.""" + normalized_question = re.sub(r'\s+', ' ', str(user_question or '').strip().lower()) + if not normalized_question: + return None + + if 'site' in normalized_question or 'sites' in normalized_question or 'sharepoint' in normalized_question: + return '2' + + return None + + +def build_tabular_follow_up_call_signature(function_name, arguments): + """Return a stable signature for a follow-up tool call.""" + normalized_arguments = {} + for argument_name, argument_value in (arguments or {}).items(): + if argument_value in (None, ''): + continue + normalized_arguments[str(argument_name)] = argument_value + + return f"{function_name}:{json.dumps(normalized_arguments, sort_keys=True, default=str)}" + + +def derive_tabular_follow_up_calls_from_invocations(user_question, invocations): + """Derive targeted follow-up calls when initial analytical results are only intermediate.""" + successful_invocations = [ + 
invocation for invocation in (invocations or []) + if not get_tabular_invocation_error_message(invocation) + ] + if not successful_invocations: + return [] + + wants_distinct_urls = is_tabular_distinct_url_question(user_question) + wants_exhaustive_results = question_requests_tabular_exhaustive_results(user_question) + wants_row_context = question_requests_tabular_row_context(user_question) + search_terms = extract_tabular_high_signal_search_terms(user_question, max_terms=4) + primary_search_term = search_terms[0] if search_terms else None + secondary_filter_terms = extract_tabular_secondary_filter_terms( + user_question, + primary_terms=[primary_search_term] if primary_search_term else None, + max_terms=3, + ) + has_row_context_tool = any( + getattr(invocation, 'function_name', '') in {'search_rows', 'filter_rows', 'query_tabular_data'} + for invocation in successful_invocations + ) + has_url_extraction_tool = any( + getattr(invocation, 'function_name', '') == 'get_distinct_values' + and str( + ((getattr(invocation, 'parameters', {}) or {}).get('extract_mode')) + or ((get_tabular_invocation_result_payload(invocation) or {}).get('extract_mode')) + or '' + ).strip().lower() == 'url' + for invocation in successful_invocations + ) + + existing_signatures = { + build_tabular_follow_up_call_signature( + getattr(invocation, 'function_name', ''), + getattr(invocation, 'parameters', {}) or {}, + ) + for invocation in successful_invocations + } + follow_up_calls = [] + + for invocation in successful_invocations: + function_name = getattr(invocation, 'function_name', '') + invocation_parameters = getattr(invocation, 'parameters', {}) or {} + result_payload = get_tabular_invocation_result_payload(invocation) or {} + filename = str(invocation_parameters.get('filename') or result_payload.get('filename') or '').strip() + if not filename: + continue + + scope_arguments = { + 'filename': filename, + 'source': invocation_parameters.get('source') or 'workspace', + } + if 
invocation_parameters.get('group_id'): + scope_arguments['group_id'] = invocation_parameters.get('group_id') + if invocation_parameters.get('public_workspace_id'): + scope_arguments['public_workspace_id'] = invocation_parameters.get('public_workspace_id') + + selected_sheet = get_tabular_invocation_selected_sheet(invocation) + if selected_sheet and 'cross-sheet' not in selected_sheet.lower(): + scope_arguments['sheet_name'] = selected_sheet + elif invocation_parameters.get('sheet_name'): + scope_arguments['sheet_name'] = invocation_parameters.get('sheet_name') + elif invocation_parameters.get('sheet_index') not in (None, ''): + scope_arguments['sheet_index'] = invocation_parameters.get('sheet_index') + + if wants_exhaustive_results and function_name in {'search_rows', 'filter_rows', 'query_tabular_data'}: + expanded_row_limit = determine_tabular_follow_up_limit( + result_payload.get('total_matches'), + result_payload.get('returned_rows'), + ) + if expanded_row_limit: + expanded_arguments = { + argument_name: argument_value + for argument_name, argument_value in invocation_parameters.items() + if argument_name not in {'user_id', 'conversation_id'} and argument_value not in (None, '') + } + expanded_arguments.update(scope_arguments) + expanded_arguments['max_rows'] = expanded_row_limit + + expanded_signature = build_tabular_follow_up_call_signature(function_name, expanded_arguments) + if expanded_signature not in existing_signatures: + follow_up_calls.append({ + 'function_name': function_name, + 'arguments': expanded_arguments, + 'reason': 'expand the matching row slice because the user asked for the full result list', + }) + existing_signatures.add(expanded_signature) + + if function_name == 'get_distinct_values': + target_column = str(invocation_parameters.get('column') or result_payload.get('column') or '').strip() + if not target_column: + continue + + current_filter_columns = [ + str(invocation_parameters.get('filter_column') or '').strip(), + 
str(invocation_parameters.get('additional_filter_column') or '').strip(), + ] + same_column_filter = any( + filter_column.casefold() == target_column.casefold() + for filter_column in current_filter_columns + if filter_column + ) + distinct_count = parse_tabular_result_count(result_payload.get('distinct_count')) + returned_values = parse_tabular_result_count(result_payload.get('returned_values')) + + if wants_exhaustive_results: + expanded_value_limit = determine_tabular_follow_up_limit(distinct_count, returned_values) + if expanded_value_limit: + expanded_arguments = { + argument_name: argument_value + for argument_name, argument_value in invocation_parameters.items() + if argument_name not in {'user_id', 'conversation_id'} and argument_value not in (None, '') + } + expanded_arguments.update(scope_arguments) + expanded_arguments['max_values'] = expanded_value_limit + + expanded_signature = build_tabular_follow_up_call_signature('get_distinct_values', expanded_arguments) + if expanded_signature not in existing_signatures: + follow_up_calls.append({ + 'function_name': 'get_distinct_values', + 'arguments': expanded_arguments, + 'reason': 'expand the returned value list because the user asked for the full result set', + }) + existing_signatures.add(expanded_signature) + + needs_broad_row_context = bool( + wants_row_context + and primary_search_term + and not has_row_context_tool + and same_column_filter + and secondary_filter_terms + and distinct_count == 0 + ) + + if wants_row_context and primary_search_term and not has_row_context_tool: + row_search_arguments = dict(scope_arguments) + row_search_arguments['search_value'] = primary_search_term + row_search_arguments['search_columns'] = target_column + + normalize_match_value = invocation_parameters.get('normalize_match') + if normalize_match_value not in (None, ''): + row_search_arguments['normalize_match'] = normalize_match_value + + if not needs_broad_row_context: + for argument_name in ( + 'query_expression', + 
'filter_column', + 'filter_operator', + 'filter_value', + 'additional_filter_column', + 'additional_filter_operator', + 'additional_filter_value', + ): + argument_value = invocation_parameters.get(argument_name) + if argument_value in (None, ''): + continue + row_search_arguments[argument_name] = argument_value + + return_columns = [] + for candidate_column in ( + invocation_parameters.get('filter_column'), + invocation_parameters.get('additional_filter_column'), + target_column, + ): + normalized_column = str(candidate_column or '').strip() + if not normalized_column or normalized_column in return_columns: + continue + return_columns.append(normalized_column) + + if return_columns: + row_search_arguments['return_columns'] = ','.join(return_columns) + + row_search_arguments['max_rows'] = '50' if needs_broad_row_context else '25' + + row_search_signature = build_tabular_follow_up_call_signature('search_rows', row_search_arguments) + if row_search_signature not in existing_signatures: + follow_up_calls.append({ + 'function_name': 'search_rows', + 'arguments': row_search_arguments, + 'reason': ( + 'collect broad row context for the literal topic before inferring a cohort column' + if needs_broad_row_context else + 'collect matching row context for the literal topic before final reasoning' + ), + }) + existing_signatures.add(row_search_signature) + has_row_context_tool = True + + if wants_distinct_urls and not str(invocation_parameters.get('extract_mode') or '').strip() and not has_url_extraction_tool: + if needs_broad_row_context: + continue + if not tabular_result_payload_contains_url_like_content(result_payload): + continue + + extraction_arguments = dict(scope_arguments) + extraction_arguments['column'] = target_column + for argument_name in ( + 'query_expression', + 'filter_column', + 'filter_operator', + 'filter_value', + 'additional_filter_column', + 'additional_filter_operator', + 'additional_filter_value', + 'normalize_match', + 'max_values', + ): + 
argument_value = invocation_parameters.get(argument_name) + if argument_value in (None, ''): + continue + extraction_arguments[argument_name] = argument_value + + extraction_arguments['extract_mode'] = 'url' + inferred_path_segments = infer_tabular_url_path_segments(user_question) + if inferred_path_segments: + extraction_arguments['url_path_segments'] = inferred_path_segments + + extraction_signature = build_tabular_follow_up_call_signature('get_distinct_values', extraction_arguments) + if extraction_signature not in existing_signatures: + follow_up_calls.append({ + 'function_name': 'get_distinct_values', + 'arguments': extraction_arguments, + 'reason': 'extract canonical URL or site values from composite text cells', + }) + existing_signatures.add(extraction_signature) + has_url_extraction_tool = True + + if function_name == 'search_rows' and wants_distinct_urls and not has_url_extraction_tool: + search_rows_result_rows = get_tabular_invocation_data_rows(invocation) + if not search_rows_result_rows: + continue + + target_column = None + searched_columns = parse_tabular_column_candidates( + result_payload.get('searched_columns') or invocation_parameters.get('search_columns') + ) + if len(searched_columns) == 1: + target_column = searched_columns[0] + else: + target_column = infer_tabular_url_value_column_from_rows( + search_rows_result_rows, + preferred_columns=searched_columns, + ) + + if not target_column: + continue + + extraction_arguments = dict(scope_arguments) + extraction_arguments['column'] = target_column + + inferred_filter = infer_tabular_secondary_filter_from_rows( + search_rows_result_rows, + secondary_filter_terms, + excluded_columns=[target_column], + ) + if inferred_filter: + extraction_arguments['filter_column'] = inferred_filter['column'] + extraction_arguments['filter_operator'] = 'contains' + extraction_arguments['filter_value'] = inferred_filter['term'] + elif not secondary_filter_terms: + for argument_name in ( + 'query_expression', + 
# Identity arguments are always supplied by the server; a reviewer or follow-up
# plan must never be able to replace them.
_TABULAR_RECOVERY_TRUSTED_ARGUMENT_NAMES = frozenset({'user_id', 'conversation_id'})


async def _execute_tabular_recovery_plugin_call(tabular_plugin, function_name, call_arguments,
                                                user_id, conversation_id, origin_label):
    """Bind planned arguments to one tabular plugin function and await it.

    Args:
        tabular_plugin: Object whose attribute ``function_name`` is awaited.
        function_name: Name of the plugin coroutine function to run.
        call_arguments: Mapping of planned argument names to values (may be None).
        user_id: Trusted user id, always passed to the plugin function.
        conversation_id: Trusted conversation id, always passed to the plugin function.
        origin_label: Prefix used in the "unavailable function" error text
            (for example ``'Reviewer'`` or ``'Auto follow-up'``).

    Returns:
        ``None`` when the call completed, otherwise a human-readable error string.
    """
    plugin_function = getattr(tabular_plugin, function_name, None)
    if plugin_function is None:
        return f"{origin_label} selected unavailable function '{function_name}'."

    accepted_parameters = inspect.signature(plugin_function).parameters
    executable_arguments = {
        'user_id': user_id,
        'conversation_id': conversation_id,
    }
    for argument_name, argument_value in (call_arguments or {}).items():
        # Plans come from model output; never let them override the caller identity.
        if argument_name in _TABULAR_RECOVERY_TRUSTED_ARGUMENT_NAMES:
            continue
        # Silently drop arguments the target function does not declare.
        if argument_name not in accepted_parameters:
            continue

        normalized_argument_value = normalize_tabular_reviewer_argument_value(
            argument_name,
            argument_value,
        )
        if normalized_argument_value is None:
            continue

        executable_arguments[argument_name] = normalized_argument_value

    try:
        await plugin_function(**executable_arguments)
    except Exception as execution_error:
        # Best effort by design: the failure is reported back to the caller
        # as a plan error instead of aborting the whole recovery attempt.
        return f"{function_name}: {execution_error}"
    return None


async def maybe_recover_tabular_analysis_with_llm_reviewer(chat_service, kernel,
                                                          tabular_plugin, plugin_logger,
                                                          user_question, schema_context,
                                                          source_context,
                                                          analysis_file_contexts,
                                                          user_id, conversation_id,
                                                          execution_mode,
                                                          allowed_function_names,
                                                          workbook_sheet_hints=None,
                                                          workbook_related_sheet_hints=None,
                                                          workbook_cross_sheet_bridge_hints=None,
                                                          tool_error_messages=None,
                                                          execution_gap_messages=None,
                                                          discovery_feedback_messages=None,
                                                          fallback_source_hint='workspace',
                                                          fallback_group_id=None,
                                                          fallback_public_workspace_id=None):
    """Use an LLM reviewer to choose analytical tool calls when the main SK loop stalls.

    The reviewer is asked for a JSON plan of up to three analytical calls. Each
    allowed call is executed directly against ``tabular_plugin``; up to two rounds
    of automatically derived follow-up calls are then executed. Results are read
    back from ``plugin_logger`` rather than from the call return values.

    Args:
        chat_service: Chat completion service used for the reviewer request.
        kernel: Kernel passed through to the chat service.
        tabular_plugin: Plugin object exposing the analytical coroutine functions.
        plugin_logger: Provides ``get_invocations_for_conversation``.
        user_question: Original user question.
        schema_context: Rendered file schemas included in the reviewer prompt.
        source_context: Rendered source description included in the reviewer prompt.
        analysis_file_contexts: File contexts used to resolve planned arguments.
        user_id: Trusted user id passed to every plugin call.
        conversation_id: Trusted conversation id passed to every plugin call.
        execution_mode: Execution mode label included in the reviewer prompt.
        allowed_function_names: Candidate function names; only those that are
            analytical functions are offered to the reviewer.
        workbook_sheet_hints: Optional hint payload added to the prompt.
        workbook_related_sheet_hints: Optional hint payload added to the prompt.
        workbook_cross_sheet_bridge_hints: Optional hint payload added to the prompt.
        tool_error_messages: Optional previous tool errors added to the prompt.
        execution_gap_messages: Optional previous execution gaps added to the prompt.
        discovery_feedback_messages: Optional discovery results added to the prompt.
        fallback_source_hint: Source used when a planned call omits one.
        fallback_group_id: Group id used when a planned call omits one.
        fallback_public_workspace_id: Public workspace id used when omitted.

    Returns:
        A dict with ``fallback``, ``tool_error_messages`` and
        ``reviewer_plan_errors`` when computed results were produced, else ``None``.
    """
    if not allowed_function_names:
        return None

    # Resolve the helper lookups once instead of once per candidate name.
    analysis_function_names = get_tabular_analysis_function_names()
    reviewer_allowed_function_names = [
        function_name for function_name in allowed_function_names
        if function_name in analysis_function_names
    ]
    if not reviewer_allowed_function_names:
        return None

    function_manifest = get_tabular_reviewer_function_manifest()
    reviewer_manifest = {
        function_name: function_manifest.get(function_name, {})
        for function_name in reviewer_allowed_function_names
    }

    reviewer_sections = [
        f"QUESTION:\n{user_question}",
        f"EXECUTION_MODE: {execution_mode}",
        f"SOURCE_CONTEXT:\n{source_context}",
        f"FILE_SCHEMAS:\n{schema_context}",
        "FUNCTION_MANIFEST:\n" + json.dumps(reviewer_manifest, indent=2, default=str),
    ]
    # Optional sections keep this fixed order; empty payloads are omitted.
    optional_sections = (
        ('WORKBOOK_DISCOVERY_RESULTS', discovery_feedback_messages),
        ('PREVIOUS_TOOL_ERRORS', tool_error_messages),
        ('PREVIOUS_EXECUTION_GAPS', execution_gap_messages),
        ('LIKELY_WORKSHEET_HINTS', workbook_sheet_hints),
        ('QUESTION_RELEVANT_WORKSHEETS', workbook_related_sheet_hints),
        ('CROSS_SHEET_BRIDGE_HINTS', workbook_cross_sheet_bridge_hints),
    )
    for section_label, section_payload in optional_sections:
        if section_payload:
            reviewer_sections.append(
                f"{section_label}:\n" + json.dumps(section_payload, indent=2, default=str)
            )

    review_history = ChatHistory()
    review_history.add_system_message(
        "You are a tabular recovery planner. A previous workbook analysis came close but did not reach computed analytical results. "
        "Choose the next 1-3 analytical tabular calls that should be executed directly. "
        "Return JSON only with this schema: {\"reasoning_summary\": \"...\", \"calls\": [{\"function\": \"get_distinct_values\", \"arguments\": {...}}]}. "
        "Rules: Use only the listed analytical functions. Do not return describe_tabular_file. "
        "Prefer the smallest number of high-confidence calls needed to compute the answer. "
        "For deterministic how-many, discrete, unique, or canonical-list questions, prefer count_rows or get_distinct_values over sampled-row tools when possible. "
        "When the user is asking where a topic, phrase, code, path, identifier, or other value appears and the relevant column is unclear, prefer search_rows. Omit search_columns to search all columns, and use return_columns to surface the fields most relevant to the question. "
        "When the user wants values from a subset or pattern within one column, prefer get_distinct_values with filter_column/filter_operator/filter_value instead of an unfiltered full-column distinct-value call. "
        "When the answer depends on two literal column conditions, prefer count_rows, get_distinct_values, or filter_rows with filter_column/filter_operator/filter_value plus additional_filter_column/additional_filter_operator/additional_filter_value instead of a broad query_expression call. "
        "When the user is asking for URLs, sites, links, or regex-like identifiers embedded inside a text cell, prefer get_distinct_values with extract_mode='url' or extract_mode='regex' rather than counting whole-cell strings. Use url_path_segments when you need canonical higher-level URL roots. "
        "If whether an embedded URL or identifier counts depends on surrounding text in the original cell rather than the extracted value itself, search/filter the original text column first. Prefer filter_rows for that text search when the matching row context matters, and set max_rows high enough to return the full cohort when it is modest. If a prior tool result is limited and the user explicitly asked for the full list, rerun with a higher max_rows or max_values instead of stopping at the preview slice. "
        "Do not classify extracted URLs solely by whether the URL text itself contains the keyword when the original cell text already defines the category. "
        "For URLs, links, paths, and literal identifiers, set normalize_match=false unless normalization is clearly necessary. "
        "Prefer sheet_name when the correct worksheet is evident from the schemas or discovery results. "
        "Omit sheet_name only for a deliberate cross-sheet analytical search. "
        "Use filename exactly as listed in FILE_SCHEMAS. "
        "Do not include user_id or conversation_id in arguments. Do not wrap the JSON in markdown fences."
    )
    review_history.add_user_message("\n\n".join(reviewer_sections))

    reviewer_settings = AzureChatPromptExecutionSettings(service_id="tabular-analysis")

    try:
        reviewer_result = await chat_service.get_chat_message_contents(
            review_history,
            reviewer_settings,
            kernel=kernel,
        )
    except Exception as reviewer_error:
        log_event(
            f"[Tabular SK Analysis] Reviewer recovery call failed: {reviewer_error}",
            level=logging.WARNING,
            exceptionTraceback=True,
        )
        return None

    reviewer_text = ''
    if reviewer_result and reviewer_result[0].content:
        reviewer_text = reviewer_result[0].content.strip()

    reviewer_calls = parse_tabular_reviewer_plan(reviewer_text)
    if not reviewer_calls:
        log_event(
            '[Tabular SK Analysis] Reviewer recovery did not return an executable analytical plan',
            extra={'reviewer_output_preview': reviewer_text[:500]},
            level=logging.WARNING,
        )
        return None

    # Invocations logged before this point belong to earlier attempts.
    baseline_invocation_count = len(plugin_logger.get_invocations_for_conversation(
        user_id,
        conversation_id,
        limit=1000,
    ))

    def split_recovery_invocations():
        """Return (successful, failed) invocations logged since the baseline."""
        invocations_after = plugin_logger.get_invocations_for_conversation(
            user_id,
            conversation_id,
            limit=1000,
        )
        return split_tabular_analysis_invocations(
            get_new_plugin_invocations(invocations_after, baseline_invocation_count)
        )

    executed_function_names = []
    reviewer_plan_errors = []

    for reviewer_call in reviewer_calls[:3]:
        function_name = reviewer_call['function_name']
        if function_name not in reviewer_allowed_function_names:
            reviewer_plan_errors.append(
                f"Reviewer selected disallowed function '{function_name}'."
            )
            continue

        call_arguments, argument_error = resolve_tabular_reviewer_call_arguments(
            reviewer_call.get('arguments'),
            analysis_file_contexts,
            fallback_source_hint=fallback_source_hint,
            fallback_group_id=fallback_group_id,
            fallback_public_workspace_id=fallback_public_workspace_id,
        )
        if argument_error:
            reviewer_plan_errors.append(argument_error)
            continue

        call_error = await _execute_tabular_recovery_plugin_call(
            tabular_plugin,
            function_name,
            call_arguments,
            user_id,
            conversation_id,
            'Reviewer',
        )
        if call_error:
            reviewer_plan_errors.append(call_error)
        else:
            executed_function_names.append(function_name)

    successful_analytical_invocations, failed_analytical_invocations = split_recovery_invocations()

    for follow_up_round in range(2):
        follow_up_calls = derive_tabular_follow_up_calls_from_invocations(
            user_question,
            successful_analytical_invocations,
        )
        if not follow_up_calls:
            break

        auto_follow_up_names = []
        for follow_up_call in follow_up_calls:
            function_name = follow_up_call.get('function_name')
            if function_name not in reviewer_allowed_function_names:
                reviewer_plan_errors.append(
                    f"Auto follow-up selected disallowed function '{function_name}'."
                )
                continue

            call_error = await _execute_tabular_recovery_plugin_call(
                tabular_plugin,
                function_name,
                follow_up_call.get('arguments'),
                user_id,
                conversation_id,
                'Auto follow-up',
            )
            if call_error:
                reviewer_plan_errors.append(call_error)
            else:
                auto_follow_up_names.append(function_name)

        # Nothing ran in this round, so another round could not see new results.
        if not auto_follow_up_names:
            break

        log_event(
            '[Tabular SK Analysis] Reviewer recovery executed automatic analytical follow-up calls',
            extra={
                'follow_up_functions': auto_follow_up_names,
                'initial_reviewer_functions': executed_function_names,
                'follow_up_round': follow_up_round + 1,
            },
            level=logging.INFO,
        )
        executed_function_names.extend(auto_follow_up_names)
        successful_analytical_invocations, failed_analytical_invocations = split_recovery_invocations()

    fallback = build_tabular_analysis_fallback_from_invocations(successful_analytical_invocations)
    failed_tool_error_messages = summarize_tabular_invocation_errors(failed_analytical_invocations)

    if fallback:
        log_event(
            '[Tabular SK Analysis] Reviewer recovery produced computed analytical tool results',
            extra={
                'reviewer_functions': executed_function_names,
                'successful_tool_count': len(successful_analytical_invocations),
                'failed_tool_count': len(failed_analytical_invocations),
            },
            level=logging.INFO,
        )
        return {
            'fallback': fallback,
            'tool_error_messages': failed_tool_error_messages,
            'reviewer_plan_errors': reviewer_plan_errors,
        }

    if reviewer_plan_errors or failed_tool_error_messages:
        log_event(
            '[Tabular SK Analysis] Reviewer recovery executed but did not produce usable analytical results',
            extra={
                'reviewer_functions': executed_function_names,
                'reviewer_plan_errors': reviewer_plan_errors[:5],
                'tool_errors': failed_tool_error_messages[:5],
                'reviewer_output_preview': reviewer_text[:500],
            },
            level=logging.WARNING,
        )

    return None


def filter_tabular_citation_invocations(invocations):
    """Hide discovery-only citation noise when analytical tabular calls exist.

    Returns the successful analytical invocations when there are any; otherwise
    the ``describe_tabular_file`` invocations that carry no error message
    (possibly an empty list).
    """
    if not invocations:
        return []

    successful_analytical_invocations, _ = split_tabular_analysis_invocations(invocations)
    if successful_analytical_invocations:
        return successful_analytical_invocations

    return [
        invocation for invocation in invocations
        if getattr(invocation, 'function_name', '') == 'describe_tabular_file'
        and not get_tabular_invocation_error_message(invocation)
    ]
def format_tabular_thought_parameter_value(value):
    """Render a concise parameter value for tabular thought details.

    Dicts, lists and tuples are rendered as JSON; everything else through
    ``str``. Returns ``None`` for ``None`` or an empty rendering, and truncates
    renderings longer than 120 characters to 117 characters plus ``'...'``.
    """
    if value is None:
        return None

    if isinstance(value, (dict, list, tuple)):
        rendered_value = json.dumps(value, default=str)
    else:
        rendered_value = str(value)

    if not rendered_value:
        return None

    if len(rendered_value) > 120:
        rendered_value = rendered_value[:117] + '...'

    return rendered_value


def get_tabular_tool_thought_payloads(invocations):
    """Convert tabular plugin invocations into user-visible thought payloads.

    Returns a list of ``(content, detail)`` tuples, one per invocation, where
    ``content`` is a one-line summary and ``detail`` is a ``'; '``-joined list of
    rendered parameters, an optional error, and the success flag.
    """
    if not invocations:
        return []

    # Loop-invariant: resolve the exclusion list once, not once per parameter.
    excluded_parameter_names = get_tabular_thought_excluded_parameter_names()
    thought_payloads = []

    for invocation in invocations:
        function_name = getattr(invocation, 'function_name', 'unknown_tool')
        duration_ms = getattr(invocation, 'duration_ms', None)
        error_message = get_tabular_invocation_error_message(invocation)
        success = getattr(invocation, 'success', True) and not error_message
        parameters = getattr(invocation, 'parameters', {}) or {}

        filename = parameters.get('filename')
        sheet_name = parameters.get('sheet_name')
        duration_suffix = f" ({int(duration_ms)}ms)" if duration_ms else ""
        if filename and sheet_name:
            content = f"Tabular tool {function_name} on {filename} [{sheet_name}]{duration_suffix}"
        elif filename:
            content = f"Tabular tool {function_name} on {filename}{duration_suffix}"
        else:
            content = f"Tabular tool {function_name}{duration_suffix}"
        if not success:
            content = f"{content} failed"

        detail_parts = []
        for parameter_name, parameter_value in parameters.items():
            if parameter_name in excluded_parameter_names:
                continue

            rendered_value = format_tabular_thought_parameter_value(parameter_value)
            if rendered_value is None:
                continue

            detail_parts.append(f"{parameter_name}={rendered_value}")

        rendered_error_message = format_tabular_thought_parameter_value(error_message)
        if rendered_error_message:
            detail_parts.append(f"error={rendered_error_message}")

        # The success flag is always present, so the detail is never empty.
        detail_parts.append(f"success={success}")
        thought_payloads.append((content, "; ".join(detail_parts)))

    return thought_payloads


def get_tabular_status_thought_payloads(invocations, analysis_succeeded):
    """Return additional tabular status thoughts for retries and fallbacks.

    Returns an empty list when no analytical invocation failed; otherwise a
    single ``(message, detail)`` tuple whose message depends on whether the
    analysis succeeded and whether any analytical invocation succeeded.
    """
    successful_analytical_invocations, failed_analytical_invocations = split_tabular_analysis_invocations(invocations)
    if not failed_analytical_invocations:
        return []

    error_messages = summarize_tabular_invocation_errors(failed_analytical_invocations)
    detail = "; ".join(error_messages) if error_messages else None

    if not analysis_succeeded:
        message = "Tabular analysis encountered tool errors before fallback"
    elif successful_analytical_invocations:
        message = "Tabular analysis recovered after retrying tool errors"
    else:
        message = "Tabular analysis recovered via internal fallback after tool errors"

    return [(message, detail)]


def _normalize_tabular_sheet_token(token):
    """Normalize question and sheet-name tokens for lightweight matching.

    Lowercases, strips every non-alphanumeric character, then applies a crude
    singularization: ``...ies`` becomes ``...y`` and a trailing ``s`` is dropped
    unless the token ends in ``ss``.
    """
    normalized = re.sub(r'[^a-z0-9]+', '', str(token or '').lower())
    if len(normalized) > 4 and normalized.endswith('ies'):
        return normalized[:-3] + 'y'
    if len(normalized) > 3 and normalized.endswith('s') and not normalized.endswith('ss'):
        return normalized[:-1]
    return normalized


# Matches "W2", "W-2", "w 2", "w_2" as a standalone term, at the start of a
# camelCase suffix ("FormW2"), or as a prefix ("W2Forms"). It must not fire on a
# word that merely ends in "w" followed by a number ("Show 2023", "row 2").
_TABULAR_W2_FORM_PATTERN = re.compile(
    r'(?:(?<![A-Za-z0-9])[wW]|(?<=[a-z])W)[\s\-_]*2(?!\d)'
)


def _tokenize_tabular_sheet_text(text):
    """Tokenize free text into normalized sheet-matching tokens.

    W-2 spellings are first collapsed to a single ``w2`` token. Tokens are then
    taken from both the raw text and a copy with camelCase, letter/digit and
    underscore/hyphen boundaries split. Tokens of one character or less are
    dropped and duplicates are removed while keeping first-seen order.
    """
    original_text = _TABULAR_W2_FORM_PATTERN.sub(' w2 ', str(text or ''))
    expanded_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', original_text)
    expanded_text = re.sub(r'([A-Za-z])([0-9])', r'\1 \2', expanded_text)
    expanded_text = re.sub(r'([0-9])([A-Za-z])', r'\1 \2', expanded_text)
    expanded_text = re.sub(r'[_\-]+', ' ', expanded_text)
    tokens = []
    seen_tokens = set()

    for raw_text in (original_text, expanded_text):
        for raw_token in re.split(r'[^a-z0-9]+', raw_text.lower()):
            normalized_token = _normalize_tabular_sheet_token(raw_token)
            if not normalized_token or len(normalized_token) <= 1:
                continue
            if normalized_token in seen_tokens:
                continue
            seen_tokens.add(normalized_token)
            tokens.append(normalized_token)

    return tokens
def _coerce_citation_sort_number(value):
    """Return a numeric citation sort value when possible.

    Returns ``None`` for ``None``, the empty string, booleans, blank or
    non-numeric text, and for values that are not finite numbers. NaN in
    particular must never reach a sort key: every comparison against it is
    false, which makes the resulting ordering inconsistent.
    """
    import math

    if value in (None, '') or isinstance(value, bool):
        return None

    if not isinstance(value, (int, float)):
        value = str(value).strip()
        if not value:
            return None

    try:
        numeric_value = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to be represented as a float.
        return None

    return numeric_value if math.isfinite(numeric_value) else None


def _build_hybrid_citation_sort_key(citation):
    """Sort numeric page citations first, then metadata-style citations safely.

    The key is a 5-tuple ``(rank, primary, secondary, page_label, metadata_type)``:
    rank 2 when ``page_number`` is numeric (primary = page, secondary = chunk
    sequence or -1.0), rank 1 when only ``chunk_sequence`` is numeric, rank 0
    otherwise. Non-dict input yields the rank-0 key with empty labels.
    """
    if not isinstance(citation, dict):
        return (0, -1.0, -1.0, '', '')

    page_number = citation.get('page_number')
    page_value = _coerce_citation_sort_number(page_number)
    chunk_sequence_value = _coerce_citation_sort_number(citation.get('chunk_sequence'))
    # The textual labels only break ties inside one rank.
    page_label = str(page_number or '').strip().lower()
    metadata_type = str(citation.get('metadata_type') or '').strip().lower()

    if page_value is not None:
        secondary_value = chunk_sequence_value if chunk_sequence_value is not None else -1.0
        return (2, page_value, secondary_value, page_label, metadata_type)

    if chunk_sequence_value is not None:
        return (1, chunk_sequence_value, -1.0, page_label, metadata_type)

    return (0, -1.0, -1.0, page_label, metadata_type)
Creates a temporary Kernel with only the TabularProcessingPlugin, uses the @@ -1661,8 +3355,16 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, execution_mode = execution_mode if execution_mode in {'analysis', 'schema_summary', 'entity_lookup'} else 'analysis' schema_summary_mode = execution_mode == 'schema_summary' entity_lookup_mode = execution_mode == 'entity_lookup' + analysis_file_contexts = normalize_tabular_file_contexts_for_analysis( + tabular_filenames=tabular_filenames, + tabular_file_contexts=tabular_file_contexts, + fallback_source_hint=source_hint, + fallback_group_id=group_id, + fallback_public_workspace_id=public_workspace_id, + ) + analysis_filenames = [file_context['file_name'] for file_context in analysis_file_contexts] log_event( - f"[Tabular SK Analysis] Starting {execution_mode} analysis for files: {tabular_filenames}", + f"[Tabular SK Analysis] Starting {execution_mode} analysis for files: {analysis_filenames}", level=logging.INFO, ) @@ -1703,11 +3405,12 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, kernel.add_service(chat_service) # 3. 
Pre-dispatch: load file schemas to eliminate discovery LLM rounds - source_context = f"source='{source_hint}'" - if group_id: - source_context += f", group_id='{group_id}'" - if public_workspace_id: - source_context += f", public_workspace_id='{public_workspace_id}'" + source_context = build_tabular_analysis_source_context( + analysis_file_contexts, + fallback_source_hint=source_hint, + fallback_group_id=group_id, + fallback_public_workspace_id=public_workspace_id, + ) schema_parts = [] workbook_sheet_hints = {} @@ -1716,22 +3419,28 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, workbook_blob_locations = {} retry_sheet_overrides = {} previous_failed_call_parameters = [] # entity lookup: concrete failed call params for retry hints + has_multi_sheet_workbook = False sheet_score_match_fn = _score_tabular_entity_sheet_match if entity_lookup_mode else _score_tabular_sheet_match - allowed_function_filters = { - 'included_functions': [ - f"tabular_processing-{function_name}" - for function_name in ( - ['describe_tabular_file'] - if schema_summary_mode else - sorted(get_tabular_analysis_function_names()) - ) - ] - } - for fname in tabular_filenames: + for file_context in analysis_file_contexts: + fname = file_context['file_name'] + file_source_hint = file_context.get('source_hint', source_hint) + file_group_id = file_context.get('group_id') + file_public_workspace_id = file_context.get('public_workspace_id') + schema_source_context = {'source': file_source_hint} + if file_group_id: + schema_source_context['group_id'] = file_group_id + if file_public_workspace_id: + schema_source_context['public_workspace_id'] = file_public_workspace_id try: container, blob_path = tabular_plugin._resolve_blob_location_with_fallback( - user_id, conversation_id, fname, source_hint, - group_id=group_id, public_workspace_id=public_workspace_id + user_id, conversation_id, fname, file_source_hint, + group_id=file_group_id, 
public_workspace_id=file_public_workspace_id + ) + tabular_plugin.remember_resolved_blob_location( + file_source_hint, + fname, + container, + blob_path, ) schema_info = tabular_plugin._build_workbook_schema_summary( container, @@ -1742,6 +3451,7 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, workbook_blob_locations[fname] = (container, blob_path) if schema_info.get('is_workbook') and schema_info.get('sheet_count', 0) > 1: + has_multi_sheet_workbook = True # Build a compact sheet directory so the model can pick the # relevant sheet itself instead of us guessing. per_sheet = schema_info.get('per_sheet_schemas', {}) @@ -1798,6 +3508,7 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, }) directory_schema = { 'filename': fname, + 'source_context': schema_source_context, 'is_workbook': True, 'sheet_count': schema_info.get('sheet_count', 0), 'likely_sheet': likely_sheet, @@ -1813,7 +3524,9 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, level=logging.DEBUG, ) else: - schema_parts.append(json.dumps(schema_info, indent=2, default=str)) + schema_with_context = dict(schema_info) + schema_with_context['source_context'] = schema_source_context + schema_parts.append(json.dumps(schema_with_context, indent=2, default=str)) if schema_info.get('is_workbook'): # Single-sheet workbook โ€” set default so the model needs no sheet arg single_sheet = (schema_info.get('sheet_names') or [None])[0] @@ -1822,12 +3535,31 @@ async def run_tabular_sk_analysis(user_question, tabular_filenames, user_id, df = tabular_plugin._read_tabular_blob_to_dataframe(container, blob_path) log_event(f"[Tabular SK Analysis] Pre-loaded schema for {fname} ({len(df)} rows)", level=logging.DEBUG) except Exception as e: - log_event(f"[Tabular SK Analysis] Failed to pre-load schema for {fname}: {e}", level=logging.WARNING) - schema_parts.append(json.dumps({"filename": fname, "error": f"Could not pre-load: {str(e)}"})) + 
log_event( + f"[Tabular SK Analysis] Failed to pre-load schema for {fname} " + f"(source={file_source_hint}, group_id={file_group_id}, public_workspace_id={file_public_workspace_id}): {e}", + level=logging.WARNING, + ) + schema_parts.append(json.dumps({ + "filename": fname, + "source_context": schema_source_context, + "error": f"Could not pre-load: {str(e)}", + })) schema_context = "\n".join(schema_parts) + allow_multi_sheet_discovery = has_multi_sheet_workbook and not schema_summary_mode + allowed_function_names = ['describe_tabular_file'] if schema_summary_mode else sorted(get_tabular_analysis_function_names()) + if allow_multi_sheet_discovery: + allowed_function_names = ['describe_tabular_file'] + allowed_function_names + allowed_function_filters = { + 'included_functions': [ + f"tabular_processing-{function_name}" + for function_name in allowed_function_names + ] + } - def build_system_prompt(force_tool_use=False, tool_error_messages=None, execution_gap_messages=None): + def build_system_prompt(force_tool_use=False, tool_error_messages=None, + execution_gap_messages=None, discovery_feedback_messages=None): if schema_summary_mode: retry_prefix = "" if force_tool_use: @@ -1898,6 +3630,17 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio "Correct the analysis plan and query the missing related worksheets before answering.\n\n" ) + discovery_feedback = "" + if discovery_feedback_messages: + rendered_discovery_feedback = "\n".join( + f"- {message}" for message in discovery_feedback_messages + ) + discovery_feedback = ( + "WORKBOOK DISCOVERY RESULTS:\n" + f"{rendered_discovery_feedback}\n" + "Use these discovery results to choose the next analytical tool calls. Discovery alone does not answer the question.\n\n" + ) + missing_sheet_feedback = "" if tool_error_messages and any( 'Specify sheet_name or sheet_index on analytical calls.' 
in error_message @@ -1976,6 +3719,13 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio "These recovery hints override the original likely-sheet guess when the previous tool call failed on the wrong worksheet.\n\n" ) + discovery_step_feedback = "" + if allow_multi_sheet_discovery: + discovery_step_feedback = ( + "MULTI-SHEET DISCOVERY:\n" + "If the right worksheet or columns are unclear, call describe_tabular_file without sheet_name as an exploration step, then continue with one or more analytical tool calls. You may need multiple tool rounds.\n\n" + ) + related_sheet_feedback = "" if workbook_related_sheet_hints: rendered_related_sheet_hints = "\n".join( @@ -2027,30 +3777,38 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio f"{entity_retry_prefix}" f"{tool_error_feedback}" f"{execution_gap_feedback}" + f"{discovery_feedback}" f"{recovery_sheet_feedback}" f"{sheet_hint_feedback}" f"{related_sheet_feedback}" + f"{discovery_step_feedback}" f"{missing_sheet_feedback}" f"FILE SCHEMAS:\n" f"{schema_context}\n\n" - "AVAILABLE FUNCTIONS: filter_rows, query_tabular_data, lookup_value, get_distinct_values, count_rows, " - "filter_rows_by_related_values, count_rows_by_related_values, aggregate_column, group_by_aggregate, and group_by_datetime_component.\n\n" - "Discovery functions are not available in this analysis run because schema context is already pre-loaded.\n\n" + f"AVAILABLE FUNCTIONS: {', '.join(allowed_function_names)}.\n\n" + + ( + "Workbook discovery is available through describe_tabular_file. Discovery-only results do NOT complete the analysis. After exploration, continue with analytical functions before answering.\n\n" + if allow_multi_sheet_discovery else + "Discovery functions are not available in this analysis run because schema context is already pre-loaded.\n\n" + ) + + "IMPORTANT:\n" - "1. 
If the question includes an exact identifier or exact entity name and the correct starting worksheet is unclear, begin with filter_rows or query_tabular_data without sheet_name so the plugin can perform a cross-sheet discovery search.\n" - "2. After the first discovery step, pass sheet_name='' on follow-up analytical calls for multi-sheet workbooks. Do not rely on a default sheet for cross-sheet entity lookups.\n" - "3. Use filter_rows or query_tabular_data first when you need full matching rows. Use lookup_value only when you already know the exact worksheet and target column.\n" - "4. Do not start with aggregate_column, group_by_aggregate, or group_by_datetime_component until you have located the relevant entity rows.\n" - "5. When using query_tabular_data, use simple DataFrame.query() syntax with backticked column names for columns containing spaces. Avoid method calls such as .str.lower() or .astype(...).\n" - "6. Then query other relevant worksheets explicitly to collect related records.\n" - "7. When a retrieved row contains a secondary identifier such as ReturnID, CaseID, AccountID, PaymentID, W2ID, or Form1099ID, reuse it to query dependent worksheets.\n" - "8. Do not stop after the first successful row if the question asks for related records across sheets.\n" - "9. If a requested record type has no corresponding worksheet in the workbook, say that the workbook does not contain that record type.\n" - "10. Clearly distinguish between no matching rows and no corresponding worksheet.\n" - "11. Summarize concrete found records sheet-by-sheet using the tool results, not schema placeholders.\n" - "12. For count or percentage questions involving a cohort defined on one sheet and facts on another, prefer get_distinct_values, count_rows, filter_rows_by_related_values, or count_rows_by_related_values over manually counting sampled rows.\n" - "13. 
Use normalize_match=true when matching names, owners, assignees, engineers, or similar entity-text columns across worksheets.\n" - "14. Do not mention hypothetical follow-up analyses, parser errors, or failed attempts unless the user explicitly asked about failures and you have actual tool error output to report." + "0. Use the source_context listed in FILE SCHEMAS for the matching filename when calling tabular_processing functions.\n" + "1. If the right worksheet is unclear on a multi-sheet workbook, you may call describe_tabular_file without sheet_name first, then continue with analytical tool calls.\n" + "2. If the question includes an exact identifier, exact entity name, or asks where a topic or value appears and the correct starting worksheet or column is unclear, begin with search_rows, filter_rows, or query_tabular_data without sheet_name so the plugin can perform a cross-sheet discovery search. Omit search_columns on search_rows to search all columns, and use return_columns to surface the fields most relevant to the lookup.\n" + "3. After the first discovery step, pass sheet_name='' on follow-up analytical calls for multi-sheet workbooks. Do not rely on a default sheet for cross-sheet entity lookups.\n" + "4. Use search_rows, filter_rows, or query_tabular_data first when you need full matching rows. Use lookup_value only when you already know the exact worksheet and target column.\n" + "5. Do not start with aggregate_column, group_by_aggregate, or group_by_datetime_component until you have located the relevant entity rows.\n" + "6. When using query_tabular_data, use simple DataFrame.query() syntax with backticked column names for columns containing spaces. Avoid method calls such as .str.lower() or .astype(...).\n" + "7. Then query other relevant worksheets explicitly to collect related records.\n" + "8. 
When a retrieved row contains a secondary identifier such as ReturnID, CaseID, AccountID, PaymentID, W2ID, or Form1099ID, reuse it to query dependent worksheets.\n" + "9. Do not stop after the first successful row if the question asks for related records across sheets.\n" + "10. If a requested record type has no corresponding worksheet in the workbook, say that the workbook does not contain that record type.\n" + "11. Clearly distinguish between no matching rows and no corresponding worksheet.\n" + "12. Summarize concrete found records sheet-by-sheet using the tool results, not schema placeholders.\n" + "13. For count or percentage questions involving a cohort defined on one sheet and facts on another, prefer get_distinct_values, count_rows, filter_rows_by_related_values, or count_rows_by_related_values over manually counting sampled rows.\n" + "14. Use normalize_match=true when matching names, owners, assignees, engineers, or similar entity-text columns across worksheets.\n" + "15. Do not mention hypothetical follow-up analyses, parser errors, or failed attempts unless the user explicitly asked about failures and you have actual tool error output to report." 
) return ( @@ -2062,39 +3820,49 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio f"{retry_prefix}" f"{tool_error_feedback}" f"{execution_gap_feedback}" + f"{discovery_feedback}" f"{recovery_sheet_feedback}" f"{sheet_hint_feedback}" f"{related_sheet_feedback}" f"{cross_sheet_bridge_feedback}" + f"{discovery_step_feedback}" f"{missing_sheet_feedback}" f"FILE SCHEMAS:\n" f"{schema_context}\n\n" - "AVAILABLE FUNCTIONS: lookup_value, get_distinct_values, count_rows, filter_rows, query_tabular_data, " - "filter_rows_by_related_values, count_rows_by_related_values, aggregate_column, group_by_aggregate, and group_by_datetime_component for year/quarter/month/week/day/hour trend analysis.\n\n" - "Discovery functions are not available in this analysis run because schema context is already pre-loaded.\n\n" + f"AVAILABLE FUNCTIONS: {', '.join(allowed_function_names)} for year/quarter/month/week/day/hour trend analysis.\n\n" + + ( + "Workbook discovery is available through describe_tabular_file. Discovery-only results do NOT complete the analysis. After exploration, continue with analytical functions before answering.\n\n" + if allow_multi_sheet_discovery else + "Discovery functions are not available in this analysis run because schema context is already pre-loaded.\n\n" + ) + + "IMPORTANT:\n" - "1. Use the pre-loaded schema to pick the correct columns, then call the plugin functions.\n" - "2. For multi-sheet workbooks, review the sheet_directory to find the most relevant sheet for the question. Pass sheet_name='' in every analytical tool call unless a trustworthy default sheet has already been established. If a CROSS-SHEET BRIDGE PLAN is provided, query the listed worksheets explicitly and do not rely on a default sheet.\n" - "3. If a previous tool error says a requested column is missing on the current sheet and suggests candidate sheets, switch to one of those candidate sheets immediately.\n" - "4. 
For account/category lookup questions at a specific period or metric, use lookup_value first. Provide lookup_column, lookup_value, and target_column.\n" - "5. If lookup_value is not sufficient, use filter_rows or query_tabular_data on the label column, then read the requested period column.\n" - "6. For deterministic how-many questions, use count_rows instead of estimating counts from partial returned rows.\n" - "7. For cohort, membership, ownership-share, or percentage questions where one sheet defines the group and another sheet contains the fact rows, use get_distinct_values, filter_rows_by_related_values, or count_rows_by_related_values.\n" - "8. When the question asks for one named member's share within that cohort, prefer count_rows_by_related_values and either read source_value_match_counts from the helper result or rerun count_rows_by_related_values with source_filter_column/source_filter_value on the reference sheet. Do not fall back to query_tabular_data or filter_rows on the fact sheet with a guessed exact text value unless the workbook already exposed that canonical target value.\n" - "9. Use normalize_match=true when matching names, owners, assignees, engineers, or similar entity-text columns across worksheets.\n" - "10. Only use aggregate_column when the user explicitly asks for a sum, average, min, max, or count across rows and count_rows is not the simpler deterministic option.\n" - "11. For time-based questions on datetime columns, use group_by_datetime_component.\n" - "12. For threshold, ranking, comparison, or correlation-like questions, first filter/query the relevant rows, then compute grouped metrics.\n" - "13. When the question asks for grouped results for each entity or category and a cross-sheet bridge plan or relationship hint is available, use the reference worksheet to identify the canonical entities or categories and the fact worksheet to compute the metric. 
Do not answer 'each X' by grouping a yes/no, boolean, or membership-flag column unless the user explicitly asked about that flag.\n" - "14. When the question asks for rows satisfying multiple conditions, prefer one combined query_expression using and/or instead of separate broad queries that you plan to intersect later.\n" - "15. Batch multiple independent function calls in a SINGLE response whenever possible.\n" - "16. Keep max_rows as small as possible. Only increase it when the user explicitly asked for an exhaustive row list or export; otherwise return total_matches plus representative rows.\n" - "17. For analytical questions, prefer deterministic counts plus lookup/filter/query/grouped computations over raw row or preview output.\n" - "18. For identifier-based workbook questions, locate the identifier on the correct sheet before explaining downstream calculations.\n" - "19. For peak, busiest, highest, or lowest questions, use grouped functions and inspect the highest_group, highest_value, lowest_group, and lowest_value summary fields.\n" - "20. Return only computed findings and name the strongest drivers clearly.\n" - "21. Do not mention hypothetical follow-up analyses, parser errors, or failed attempts unless the user explicitly asked about failures and you have actual tool error output to report.\n" - "22. When using query_tabular_data, use simple DataFrame.query() syntax with backticked column names for columns containing spaces. Avoid method calls such as .str.lower(), .astype(...), or other Python expressions that DataFrame.query() may reject." + "1. Use the pre-loaded schema to pick the correct columns, then call the plugin functions. Use the source_context listed in FILE SCHEMAS for the matching filename.\n" + "2. For multi-sheet workbooks, review the sheet_directory to find the most relevant sheet for the question. If the right worksheet is still unclear, call describe_tabular_file without sheet_name, then continue with analytical calls. 
Pass sheet_name='' in follow-up analytical tool calls unless a trustworthy default sheet has already been established or you are intentionally doing an initial cross-sheet discovery step. If a CROSS-SHEET BRIDGE PLAN is provided, query the listed worksheets explicitly and do not rely on a default sheet.\n" + "3. If the question includes an exact identifier or asks where a topic, phrase, path, code, or other value appears and the correct starting worksheet or column is unclear, begin with search_rows, filter_rows, or query_tabular_data without sheet_name so the plugin can perform a cross-sheet discovery search. Omit search_columns on search_rows to search all columns, and use return_columns to surface the columns most relevant to the question.\n" + "4. If a previous tool error says a requested column is missing on the current sheet and suggests candidate sheets, switch to one of those candidate sheets immediately.\n" + "5. For account/category lookup questions at a specific period or metric, use lookup_value first. Provide lookup_column, lookup_value, and target_column.\n" + "6. If lookup_value is not sufficient, use search_rows, filter_rows, or query_tabular_data on the relevant label or text columns, then read the requested period or target column.\n" + "7. For deterministic how-many questions, use count_rows instead of estimating counts from partial returned rows. Use get_distinct_values when the answer depends on the unique values present in a column. When the cohort is defined by two literal conditions on different columns, prefer count_rows, get_distinct_values, or filter_rows with filter_column/filter_operator/filter_value plus additional_filter_column/additional_filter_operator/additional_filter_value instead of a broad query_tabular_data call.\n" + "8. When URLs, links, sites, or regex-like identifiers are embedded inside a text column, prefer get_distinct_values with extract_mode='url' or extract_mode='regex' after filtering the relevant cohort. 
Use url_path_segments when the question asks for higher-level URL roots rather than full page paths.\n" + "9. If whether an embedded URL, site, link, or identifier counts depends on surrounding text in the original cell rather than the extracted value itself, search/filter the original text column first. Prefer filter_rows when the matching row context matters, and return the full matching rows when the cohort is modest enough to fit comfortably.\n" + "10. Do not classify extracted URLs solely by whether the URL text itself contains the keyword when the original cell text already defines the category.\n" + "11. For cohort, membership, ownership-share, or percentage questions where one sheet defines the group and another sheet contains the fact rows, use get_distinct_values, filter_rows_by_related_values, or count_rows_by_related_values.\n" + "12. When the question asks for one named member's share within that cohort, prefer count_rows_by_related_values and either read source_value_match_counts from the helper result or rerun count_rows_by_related_values with source_filter_column/source_filter_value on the reference sheet. Do not fall back to query_tabular_data or filter_rows on the fact sheet with a guessed exact text value unless the workbook already exposed that canonical target value.\n" + "13. Use normalize_match=true when matching names, owners, assignees, engineers, or similar entity-text columns across worksheets.\n" + "14. Only use aggregate_column when the user explicitly asks for a sum, average, min, max, or count across rows and count_rows is not the simpler deterministic option.\n" + "15. For time-based questions on datetime columns, use group_by_datetime_component.\n" + "16. For threshold, ranking, comparison, or correlation-like questions, first filter/query the relevant rows, then compute grouped metrics.\n" + "17. 
When the question asks for grouped results for each entity or category and a cross-sheet bridge plan or relationship hint is available, use the reference worksheet to identify the canonical entities or categories and the fact worksheet to compute the metric. Do not answer 'each X' by grouping a yes/no, boolean, or membership-flag column unless the user explicitly asked about that flag.\n" + "18. When the question asks for rows satisfying multiple conditions, prefer one combined query_expression using and/or instead of separate broad queries that you plan to intersect later.\n" + "19. Batch multiple independent function calls in a SINGLE response whenever possible.\n" + "20. Keep max_rows as small as possible. Only increase it when the user explicitly asked for an exhaustive row list or export, or when the full matching row context is required and the cohort is modest; otherwise return total_matches plus representative rows. If a prior result reports total_matches > returned_rows or distinct_count > returned_values for a full-list question, rerun with a higher max_rows or max_values before answering.\n" + "21. For analytical questions, prefer deterministic counts plus lookup/filter/query/grouped computations over raw row or preview output.\n" + "22. For identifier-based workbook questions, locate the identifier on the correct sheet before explaining downstream calculations.\n" + "23. For peak, busiest, highest, or lowest questions, use grouped functions and inspect the highest_group, highest_value, lowest_group, and lowest_value summary fields.\n" + "24. Return only computed findings and name the strongest drivers clearly.\n" + "25. Do not mention hypothetical follow-up analyses, parser errors, or failed attempts unless the user explicitly asked about failures and you have actual tool error output to report.\n" + "26. When using query_tabular_data, use simple DataFrame.query() syntax with backticked column names for columns containing spaces. 
Avoid method calls such as .str.lower(), .astype(...), or other Python expressions that DataFrame.query() may reject." ) baseline_invocations = plugin_logger.get_invocations_for_conversation( @@ -2105,20 +3873,24 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio baseline_invocation_count = len(baseline_invocations) previous_tool_error_messages = [] previous_execution_gap_messages = [] + previous_discovery_feedback_messages = [] + analysis_requires_immediate_tool_choice = has_multi_sheet_workbook and not schema_summary_mode for attempt_number in range(1, 4): - force_tool_use = attempt_number > 1 + force_tool_use = attempt_number > 1 or (attempt_number == 1 and analysis_requires_immediate_tool_choice) # 4. Build chat history with pre-loaded schemas chat_history = SKChatHistory() chat_history.add_system_message(build_system_prompt( force_tool_use=force_tool_use, tool_error_messages=previous_tool_error_messages, execution_gap_messages=previous_execution_gap_messages, + discovery_feedback_messages=previous_discovery_feedback_messages, )) chat_history.add_user_message( f"Analyze the tabular data to answer: {user_question}\n" - f"Use user_id='{user_id}', conversation_id='{conversation_id}', {source_context}." + f"Use user_id='{user_id}', conversation_id='{conversation_id}'.\n" + f"{source_context}" ) # 5. 
Execute with auto function calling @@ -2249,6 +4021,7 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio if successful_analytical_invocations: previous_tool_error_messages = [] previous_failed_call_parameters = [] + previous_discovery_feedback_messages = [] if entity_lookup_mode: selected_sheets = get_tabular_invocation_selected_sheets(successful_analytical_invocations) @@ -2363,7 +4136,12 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio level=logging.WARNING ) elif discovery_invocations: - previous_execution_gap_messages = [] + previous_discovery_feedback_messages = summarize_tabular_discovery_invocations( + successful_schema_summary_invocations or discovery_invocations, + ) + previous_execution_gap_messages = [ + 'Previous attempt explored workbook structure but did not execute analytical functions. Continue with analytical tool calls now.' + ] discovery_function_names = sorted({ invocation.function_name for invocation in discovery_invocations }) @@ -2372,13 +4150,19 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio level=logging.WARNING ) elif new_invocation_count > 0: + previous_discovery_feedback_messages = [] previous_execution_gap_messages = [] log_event( f"[Tabular SK Analysis] Attempt {attempt_number} used unsupported tool(s) without computed analysis; retrying", level=logging.WARNING ) else: - previous_execution_gap_messages = [] + previous_discovery_feedback_messages = [] + previous_execution_gap_messages = ( + ['Previous attempt did not use any tools. 
Start with workbook discovery if the right worksheet is unclear, then continue with analytical tool calls.'] + if allow_multi_sheet_discovery else + [] + ) log_event( f"[Tabular SK Analysis] Attempt {attempt_number} returned narrative without tool use; retrying", level=logging.WARNING @@ -2397,6 +4181,7 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio ) elif failed_analytical_invocations: previous_tool_error_messages = summarize_tabular_invocation_errors(failed_analytical_invocations) + previous_discovery_feedback_messages = [] previous_execution_gap_messages = [] log_event( f"[Tabular SK Analysis] Attempt {attempt_number} returned no content after tool errors; retrying", @@ -2414,6 +4199,34 @@ def build_system_prompt(force_tool_use=False, tool_error_messages=None, executio baseline_invocation_count = len(invocations_after) + reviewer_recovery = None + if has_multi_sheet_workbook and not schema_summary_mode: + reviewer_recovery = await maybe_recover_tabular_analysis_with_llm_reviewer( + chat_service=chat_service, + kernel=kernel, + tabular_plugin=tabular_plugin, + plugin_logger=plugin_logger, + user_question=user_question, + schema_context=schema_context, + source_context=source_context, + analysis_file_contexts=analysis_file_contexts, + user_id=user_id, + conversation_id=conversation_id, + execution_mode=execution_mode, + allowed_function_names=allowed_function_names, + workbook_sheet_hints=workbook_sheet_hints, + workbook_related_sheet_hints=workbook_related_sheet_hints, + workbook_cross_sheet_bridge_hints=workbook_cross_sheet_bridge_hints, + tool_error_messages=previous_tool_error_messages, + execution_gap_messages=previous_execution_gap_messages, + discovery_feedback_messages=previous_discovery_feedback_messages, + fallback_source_hint=source_hint, + fallback_group_id=group_id, + fallback_public_workspace_id=public_workspace_id, + ) + if reviewer_recovery and reviewer_recovery.get('fallback'): + return 
reviewer_recovery['fallback'] + log_event("[Tabular SK Analysis] Unable to obtain computed tool-backed results", level=logging.WARNING) return None @@ -2515,17 +4328,125 @@ def get_document_container_for_scope(document_scope): return cosmos_user_documents_container -def get_selected_workspace_tabular_filenames(selected_document_ids=None, selected_document_id=None, document_scope='personal'): - """Resolve explicitly selected workspace documents and return tabular filenames.""" +def get_document_containers_for_scope(document_scope): + """Return workspace source/container pairs for the requested document scope.""" + if document_scope == 'group': + return [('group', cosmos_group_documents_container)] + if document_scope == 'public': + return [('public', cosmos_public_documents_container)] + if document_scope == 'all': + return [ + ('workspace', cosmos_user_documents_container), + ('group', cosmos_group_documents_container), + ('public', cosmos_public_documents_container), + ] + return [('workspace', cosmos_user_documents_container)] + + +def build_tabular_file_context(file_name, source_hint='workspace', group_id=None, public_workspace_id=None): + """Build normalized source metadata for a tabular file when enough scope is known.""" + normalized_file_name = str(file_name or '').strip() + if not is_tabular_filename(normalized_file_name): + return None + + normalized_source_hint = str(source_hint or 'workspace').strip().lower() + if normalized_source_hint == 'personal': + normalized_source_hint = 'workspace' + if normalized_source_hint not in {'workspace', 'chat', 'group', 'public'}: + normalized_source_hint = 'workspace' + + normalized_group_id = str(group_id or '').strip() or None + normalized_public_workspace_id = str(public_workspace_id or '').strip() or None + + if normalized_source_hint == 'group' and not normalized_group_id: + normalized_source_hint = 'workspace' + if normalized_source_hint == 'public' and not normalized_public_workspace_id: + normalized_source_hint 
= 'workspace' + + context = { + 'file_name': normalized_file_name, + 'source_hint': normalized_source_hint, + } + if normalized_source_hint == 'group' and normalized_group_id: + context['group_id'] = normalized_group_id + if normalized_source_hint == 'public' and normalized_public_workspace_id: + context['public_workspace_id'] = normalized_public_workspace_id + return context + + +def dedupe_tabular_file_contexts(file_contexts=None): + """Return unique tabular file contexts while preserving the first-seen order.""" + unique_contexts = [] + seen_contexts = set() + + for file_context in file_contexts or []: + if not isinstance(file_context, Mapping): + continue + + context_key = ( + str(file_context.get('file_name') or '').strip(), + str(file_context.get('source_hint') or 'workspace').strip().lower(), + str(file_context.get('group_id') or '').strip(), + str(file_context.get('public_workspace_id') or '').strip(), + ) + if not context_key[0] or context_key in seen_contexts: + continue + + seen_contexts.add(context_key) + unique_contexts.append(dict(file_context)) + + return unique_contexts + + +def infer_tabular_source_context_from_document(source_doc, document_scope='personal', + active_group_id=None, active_public_workspace_id=None): + """Infer tabular file source metadata from a search result or citation document.""" + if not isinstance(source_doc, Mapping): + return None + + file_name = source_doc.get('file_name') + doc_group_id = str(source_doc.get('group_id') or '').strip() or None + doc_public_workspace_id = str(source_doc.get('public_workspace_id') or '').strip() or None + + if doc_public_workspace_id: + return build_tabular_file_context( + file_name, + source_hint='public', + public_workspace_id=doc_public_workspace_id, + ) + if doc_group_id: + return build_tabular_file_context( + file_name, + source_hint='group', + group_id=doc_group_id, + ) + if document_scope == 'group': + return build_tabular_file_context( + file_name, + source_hint='group', + 
group_id=active_group_id, + ) + if document_scope == 'public': + return build_tabular_file_context( + file_name, + source_hint='public', + public_workspace_id=active_public_workspace_id, + ) + return build_tabular_file_context(file_name, source_hint='workspace') + + +def get_selected_workspace_tabular_file_contexts(selected_document_ids=None, selected_document_id=None, + document_scope='personal', active_group_id=None, + active_public_workspace_id=None): + """Resolve explicitly selected workspace documents and return tabular source contexts.""" selected_ids = list(selected_document_ids or []) if not selected_ids and selected_document_id and selected_document_id != 'all': selected_ids = [selected_document_id] if not selected_ids: - return set() + return [] - cosmos_container = get_document_container_for_scope(document_scope) - tabular_filenames = set() + tabular_file_contexts = [] for doc_id in selected_ids: if not doc_id or doc_id == 'all': @@ -2533,58 +4454,337 @@ def get_selected_workspace_tabular_filenames(selected_document_ids=None, selecte try: doc_query = ( - "SELECT TOP 1 c.file_name, c.title " + "SELECT TOP 1 c.file_name, c.title, c.group_id, c.public_workspace_id " "FROM c WHERE c.id = @doc_id " "ORDER BY c.version DESC" ) - doc_params = [{"name": "@doc_id", "value": doc_id}] - doc_results = list(cosmos_container.query_items( - query=doc_query, - parameters=doc_params, - enable_cross_partition_query=True - )) + doc_params = [{"name": "@doc_id", "value": doc_id}] + + for source_hint, cosmos_container in get_document_containers_for_scope(document_scope): + doc_results = list(cosmos_container.query_items( + query=doc_query, + parameters=doc_params, + enable_cross_partition_query=True + )) + + if not doc_results: + continue + + doc_info = doc_results[0] + file_context = build_tabular_file_context( + doc_info.get('file_name') or doc_info.get('title'), + source_hint=source_hint, + group_id=doc_info.get('group_id') or active_group_id, + 
public_workspace_id=doc_info.get('public_workspace_id') or active_public_workspace_id, + ) + if file_context: + tabular_file_contexts.append(file_context) + break + except Exception as e: + log_event( + f"[Tabular SK Analysis] Failed to resolve selected document '{doc_id}': {e}", + level=logging.WARNING + ) + + return dedupe_tabular_file_contexts(tabular_file_contexts) + + +def collect_workspace_tabular_file_contexts(combined_documents=None, selected_document_ids=None, + selected_document_id=None, document_scope='personal', + active_group_id=None, active_public_workspace_id=None): + """Collect tabular source contexts from search results and explicit workspace selection.""" + tabular_file_contexts = [] + + for source_doc in combined_documents or []: + file_context = infer_tabular_source_context_from_document( + source_doc, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + ) + if file_context: + tabular_file_contexts.append(file_context) + + tabular_file_contexts.extend(get_selected_workspace_tabular_file_contexts( + selected_document_ids=selected_document_ids, + selected_document_id=selected_document_id, + document_scope=document_scope, + active_group_id=active_group_id, + active_public_workspace_id=active_public_workspace_id, + )) + + return dedupe_tabular_file_contexts(tabular_file_contexts) + + +def collect_workspace_tabular_filenames(combined_documents=None, selected_document_ids=None, + selected_document_id=None, document_scope='personal', + active_group_id=None, active_public_workspace_id=None): + """Collect unique tabular filenames from search results and explicit workspace selection.""" + tabular_file_contexts = collect_workspace_tabular_file_contexts( + combined_documents=combined_documents, + selected_document_ids=selected_document_ids, + selected_document_id=selected_document_id, + document_scope=document_scope, + active_group_id=active_group_id, + 
active_public_workspace_id=active_public_workspace_id, + ) + return {file_context['file_name'] for file_context in tabular_file_contexts} + + +def normalize_tabular_file_contexts_for_analysis(tabular_filenames=None, tabular_file_contexts=None, + fallback_source_hint='workspace', fallback_group_id=None, + fallback_public_workspace_id=None): + """Return per-file tabular source contexts, defaulting to a shared fallback only when needed.""" + normalized_contexts = dedupe_tabular_file_contexts(tabular_file_contexts) + if normalized_contexts: + return normalized_contexts + + fallback_contexts = [] + for file_name in tabular_filenames or []: + fallback_context = build_tabular_file_context( + file_name, + source_hint=fallback_source_hint, + group_id=fallback_group_id, + public_workspace_id=fallback_public_workspace_id, + ) + if fallback_context: + fallback_contexts.append(fallback_context) + + return dedupe_tabular_file_contexts(fallback_contexts) + + +def build_tabular_analysis_source_context(tabular_file_contexts=None, fallback_source_hint='workspace', + fallback_group_id=None, fallback_public_workspace_id=None): + """Build prompt instructions for per-file tabular source metadata.""" + normalized_contexts = dedupe_tabular_file_contexts(tabular_file_contexts) + if normalized_contexts: + lines = [ + "Use the following per-file source metadata on tabular_processing tool calls. 
" + "Do not substitute a different source for a listed file:", + ] + for file_context in normalized_contexts: + context_parts = [f"source='{file_context.get('source_hint', 'workspace')}'"] + if file_context.get('group_id'): + context_parts.append(f"group_id='{file_context['group_id']}'") + if file_context.get('public_workspace_id'): + context_parts.append(f"public_workspace_id='{file_context['public_workspace_id']}'") + lines.append(f"- {file_context['file_name']}: {', '.join(context_parts)}") + return "\n".join(lines) + + fallback_parts = [f"source='{fallback_source_hint}'"] + if fallback_source_hint == 'group' and fallback_group_id: + fallback_parts.append(f"group_id='{fallback_group_id}'") + if fallback_source_hint == 'public' and fallback_public_workspace_id: + fallback_parts.append(f"public_workspace_id='{fallback_public_workspace_id}'") + return f"Use {', '.join(fallback_parts)} on tabular_processing tool calls." + + +def determine_tabular_source_hint(document_scope, active_group_id=None, active_public_workspace_id=None): + """Map workspace scope metadata to the tabular plugin source hint.""" + if document_scope == 'group' and active_group_id: + return 'group' + if document_scope == 'public' and active_public_workspace_id: + return 'public' + return 'workspace' + + +async def run_multi_file_tabular_distinct_url_analysis(user_question, analysis_file_contexts, + user_id, conversation_id): + """Run deterministic per-file URL extraction and union the distinct results in Python.""" + from semantic_kernel_plugins.tabular_processing_plugin import TabularProcessingPlugin + + del user_question + normalized_contexts = dedupe_tabular_file_contexts(analysis_file_contexts) + if len(normalized_contexts) <= 1: + return None + + tabular_plugin = TabularProcessingPlugin() + successful_results = [] + fatal_failures = [] + + for file_context in normalized_contexts: + filename = file_context['file_name'] + source_hint = file_context.get('source_hint', 'workspace') + group_id = 
file_context.get('group_id') + public_workspace_id = file_context.get('public_workspace_id') + + try: + container_name, blob_name = tabular_plugin._resolve_blob_location_with_fallback( + user_id, + conversation_id, + filename, + source_hint, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + tabular_plugin.remember_resolved_blob_location( + source_hint, + filename, + container_name, + blob_name, + ) + schema_info = tabular_plugin._build_workbook_schema_summary( + container_name, + blob_name, + filename, + preview_rows=2, + ) + except Exception as exc: + fatal_failures.append({ + 'filename': filename, + 'source': source_hint, + 'error': f'Could not load workbook schema: {exc}', + }) + continue - if not doc_results: - continue + selected_sheet, selected_column = select_tabular_distinct_url_sheet_and_column(schema_info) + if not selected_column: + fatal_failures.append({ + 'filename': filename, + 'source': source_hint, + 'error': 'Could not identify a URL/location-style column from workbook schema.', + }) + continue - file_name = doc_results[0].get('file_name') or doc_results[0].get('title') - if is_tabular_filename(file_name): - tabular_filenames.add(file_name) - except Exception as e: - log_event( - f"[Tabular SK Analysis] Failed to resolve selected document '{doc_id}': {e}", - level=logging.WARNING - ) + base_arguments = { + 'user_id': user_id, + 'conversation_id': conversation_id, + 'filename': filename, + 'column': selected_column, + 'extract_mode': 'regex', + 'extract_pattern': MULTI_FILE_TABULAR_DISTINCT_URL_EXTRACT_PATTERN, + 'normalize_match': 'false', + 'max_values': '10000', + 'source': source_hint, + } + if group_id: + base_arguments['group_id'] = group_id + if public_workspace_id: + base_arguments['public_workspace_id'] = public_workspace_id + + attempt_arguments = [] + primary_arguments = dict(base_arguments) + if selected_sheet: + primary_arguments['sheet_name'] = selected_sheet + attempt_arguments.append(primary_arguments) + + if ( + 
selected_sheet + and schema_info.get('is_workbook') + and int(schema_info.get('sheet_count', 0) or 0) > 1 + ): + attempt_arguments.append(dict(base_arguments)) - return tabular_filenames + best_result_payload = None + best_result_counts = None + last_error_message = None + for current_arguments in attempt_arguments: + raw_result = await tabular_plugin.get_distinct_values(**current_arguments) + try: + result_payload = json.loads(raw_result) + except (TypeError, ValueError): + last_error_message = 'get_distinct_values returned a non-JSON payload.' + continue + if result_payload.get('error'): + last_error_message = str(result_payload.get('error')).strip() + continue -def collect_workspace_tabular_filenames(combined_documents=None, selected_document_ids=None, - selected_document_id=None, document_scope='personal'): - """Collect tabular filenames from search results and explicit workspace selection.""" - tabular_filenames = set() + distinct_count = parse_tabular_result_count(result_payload.get('distinct_count')) or 0 + returned_values = parse_tabular_result_count(result_payload.get('returned_values')) or 0 + comparison_key = (distinct_count, returned_values) + if best_result_counts is None or comparison_key > best_result_counts: + best_result_payload = result_payload + best_result_counts = comparison_key + + if best_result_payload is None: + fatal_failures.append({ + 'filename': filename, + 'source': source_hint, + 'error': last_error_message or 'Distinct URL extraction failed for this file.', + }) + continue - for source_doc in combined_documents or []: - file_name = source_doc.get('file_name', '') - if is_tabular_filename(file_name): - tabular_filenames.add(file_name) + successful_results.append(best_result_payload) - tabular_filenames.update(get_selected_workspace_tabular_filenames( - selected_document_ids=selected_document_ids, - selected_document_id=selected_document_id, - document_scope=document_scope, - )) + if fatal_failures: + log_event( + '[Tabular Multi-File] 
Deterministic distinct URL analysis could not cover every file; falling back to SK orchestration.', + extra={ + 'conversation_id': conversation_id, + 'file_count': len(normalized_contexts), + 'fatal_failures': fatal_failures[:5], + }, + level=logging.WARNING, + ) + return None - return tabular_filenames + combined_analysis = build_multi_file_tabular_distinct_value_analysis(successful_results) + if combined_analysis: + log_event( + '[Tabular Multi-File] Deterministic distinct URL analysis completed.', + extra={ + 'conversation_id': conversation_id, + 'file_count': len(normalized_contexts), + 'matched_file_count': len(successful_results), + }, + level=logging.INFO, + ) + return combined_analysis + + +async def run_tabular_analysis_with_multi_file_support(user_question, tabular_filenames, user_id, + conversation_id, gpt_model, settings, + source_hint='workspace', group_id=None, + public_workspace_id=None, + execution_mode='analysis', + tabular_file_contexts=None): + """Run deterministic multi-file helpers first, then fall back to the SK planner.""" + analysis_file_contexts = normalize_tabular_file_contexts_for_analysis( + tabular_filenames=tabular_filenames, + tabular_file_contexts=tabular_file_contexts, + fallback_source_hint=source_hint, + fallback_group_id=group_id, + fallback_public_workspace_id=public_workspace_id, + ) + multi_file_mode = get_multi_file_tabular_analysis_mode( + user_question, + execution_mode=execution_mode, + analysis_file_contexts=analysis_file_contexts, + ) -def determine_tabular_source_hint(document_scope, active_group_id=None, active_public_workspace_id=None): - """Map workspace scope metadata to the tabular plugin source hint.""" - if document_scope == 'group' and active_group_id: - return 'group' - if document_scope == 'public' and active_public_workspace_id: - return 'public' - return 'workspace' + if multi_file_mode == 'distinct_url_union': + log_event( + '[Tabular Multi-File] Starting deterministic distinct URL union analysis.', + 
extra={ + 'conversation_id': conversation_id, + 'file_names': [file_context['file_name'] for file_context in analysis_file_contexts], + }, + level=logging.INFO, + ) + deterministic_analysis = await run_multi_file_tabular_distinct_url_analysis( + user_question, + analysis_file_contexts, + user_id, + conversation_id, + ) + if deterministic_analysis: + return deterministic_analysis + + return await run_tabular_sk_analysis( + user_question=user_question, + tabular_filenames=tabular_filenames, + tabular_file_contexts=analysis_file_contexts, + user_id=user_id, + conversation_id=conversation_id, + gpt_model=gpt_model, + settings=settings, + source_hint=source_hint, + group_id=group_id, + public_workspace_id=public_workspace_id, + execution_mode=execution_mode, + ) def resolve_foundry_scope_for_auth(auth_settings, endpoint=None): @@ -2988,6 +5188,51 @@ def consume_stream(): } ) + def get_facts_for_context(scope_id, scope_type, conversation_id: str = None, agent_id: str = None): + if not scope_id or not scope_type: + return "" + fact_store = FactMemoryStore() + kwargs = dict( + scope_type=scope_type, + scope_id=scope_id, + ) + if agent_id: + kwargs['agent_id'] = agent_id + if conversation_id: + kwargs['conversation_id'] = conversation_id + facts = fact_store.get_facts(**kwargs) + if not facts: + return "" + fact_lines = [] + for fact in facts: + value = str(fact.get('value') or '').strip() + if value: + fact_lines.append(f"- {value}") + if not fact_lines: + return "" + fact_lines.append(f"- agent_id: {agent_id or 'None'}") + fact_lines.append(f"- scope_type: {scope_type}") + fact_lines.append(f"- scope_id: {scope_id}") + fact_lines.append(f"- conversation_id: {conversation_id or 'None'}") + return "\n".join(fact_lines) + + def inject_fact_memory_context(conversation_history, scope_id, scope_type, conversation_id: str = None, agent_id: str = None): + facts = get_facts_for_context( + scope_id=scope_id, + scope_type=scope_type, + conversation_id=conversation_id, + 
agent_id=agent_id, + ) + if facts: + conversation_history.insert(0, { + "role": "system", + "content": f"\n{facts}\n" + }) + conversation_history.insert(0, { + "role": "system", + "content": f"""\n\n\n\n\n""" + }) + @app.route('/api/chat', methods=['POST']) @swagger_route(security=get_auth_security()) @login_required @@ -3158,6 +5403,18 @@ def result_requires_message_reload(result: Any) -> bool: if isinstance(image_gen_enabled, str): image_gen_enabled = image_gen_enabled.lower() == 'true' + original_hybrid_search_enabled = bool(hybrid_search_enabled) + history_grounded_search_used = False + history_only_answerability = None + prior_grounded_document_refs = [] + effective_document_scope = document_scope + effective_selected_document_ids = list(selected_document_ids or []) + effective_selected_document_id = selected_document_id + effective_active_group_ids = list(active_group_ids or []) + effective_active_group_id = active_group_id + effective_active_public_workspace_ids = list(active_public_workspace_ids or []) + effective_active_public_workspace_id = active_public_workspace_id + # GPT & Image generation APIM or direct gpt_model = "" gpt_client = None @@ -3790,16 +6047,127 @@ def result_requires_message_reload(result: Any) -> bool: debug_print(f"[Content Safety Error] {e}") except Exception as ex: debug_print(f"[Content Safety] Unexpected error: {ex}") + + if not original_hybrid_search_enabled: + prior_grounded_document_refs = _normalize_prior_grounded_document_refs(conversation_item) + if prior_grounded_document_refs: + thought_tracker.add_thought( + 'history_context', + 'Checking whether prior conversation context already answers the question', + detail=f"grounded_documents={len(prior_grounded_document_refs)}" + ) + try: + preflight_messages_query = ( + "SELECT * FROM c WHERE c.conversation_id = @conv_id ORDER BY c.timestamp ASC" + ) + preflight_messages_params = [{"name": "@conv_id", "value": conversation_id}] + preflight_messages = 
list(cosmos_messages_container.query_items( + query=preflight_messages_query, + parameters=preflight_messages_params, + partition_key=conversation_id, + enable_cross_partition_query=True, + )) + preflight_history_segments = build_conversation_history_segments( + all_messages=preflight_messages, + conversation_history_limit=conversation_history_limit, + enable_summarize_older_messages=enable_summarize_content_history_beyond_conversation_history_limit, + gpt_client=gpt_client, + gpt_model=gpt_model, + user_message_id=user_message_id, + fallback_user_message=user_message, + ) + history_only_answerability = assess_history_only_answerability( + gpt_client, + gpt_model, + build_history_only_assessment_messages( + preflight_history_segments, + settings.get('default_system_prompt', '').strip(), + ), + ) + except Exception as assessment_error: + debug_print( + f"[History Fallback] History-only sufficiency assessment failed: {assessment_error}" + ) + + if history_only_answerability and history_only_answerability.get('can_answer_from_history'): + thought_tracker.add_thought( + 'history_context', + 'Prior conversation context appears sufficient without new document retrieval', + detail=history_only_answerability.get('reason') or None, + ) + else: + fallback_search_parameters = build_prior_grounded_document_search_parameters( + prior_grounded_document_refs + ) + if fallback_search_parameters.get('document_ids'): + history_grounded_search_used = True + effective_document_scope = fallback_search_parameters.get('doc_scope') or 'all' + effective_selected_document_ids = list( + fallback_search_parameters.get('document_ids') or [] + ) + effective_selected_document_id = ( + effective_selected_document_ids[0] + if len(effective_selected_document_ids) == 1 + else None + ) + effective_active_group_ids = list( + fallback_search_parameters.get('active_group_ids') or [] + ) + effective_active_group_id = fallback_search_parameters.get('active_group_id') + 
effective_active_public_workspace_ids = list( + fallback_search_parameters.get('active_public_workspace_ids') or [] + ) + effective_active_public_workspace_id = fallback_search_parameters.get( + 'active_public_workspace_id' + ) + + rewritten_search_query = '' + if history_only_answerability: + rewritten_search_query = str( + history_only_answerability.get('search_query') or '' + ).strip() + if rewritten_search_query: + search_query = rewritten_search_query + + fallback_detail_parts = [ + f"documents={len(effective_selected_document_ids)}", + f"scope={effective_document_scope or 'all'}", + ] + if history_only_answerability and history_only_answerability.get('reason'): + fallback_detail_parts.append( + f"reason={history_only_answerability['reason']}" + ) + thought_tracker.add_thought( + 'search', + 'Conversation context alone was insufficient; searching previously grounded documents', + detail=' | '.join(fallback_detail_parts), + ) + + user_metadata.setdefault('workspace_search', {})[ + 'history_grounded_fallback' + ] = { + 'used': True, + 'document_scope': effective_document_scope, + 'document_count': len(effective_selected_document_ids), + 'search_query': search_query, + } + user_message_doc['metadata'] = user_metadata + cosmos_messages_container.upsert_item(user_message_doc) + else: + thought_tracker.add_thought( + 'history_context', + 'No prior grounded documents were available; using conversation history only' + ) # region 4 - Augmentation # --------------------------------------------------------------------- # 4) Augmentation (Search, etc.) 
- Run *before* final history prep # --------------------------------------------------------------------- # Hybrid Search - if hybrid_search_enabled: + if hybrid_search_enabled or history_grounded_search_used: # Optional: Summarize recent history *for search* (uses its own limit) - if enable_summarize_content_history_for_search: + if hybrid_search_enabled and enable_summarize_content_history_for_search: # Fetch last N messages for search context limit_n_search = number_of_historical_messages_to_summarize * 2 query_search = f"SELECT TOP {limit_n_search} * FROM c WHERE c.conversation_id = @conv_id ORDER BY c.timestamp DESC" @@ -3852,7 +6220,16 @@ def result_requires_message_reload(result: Any) -> bool: # Perform the search - thought_tracker.add_thought('search', f"Searching {document_scope or 'personal'} workspace documents for '{(search_query or user_message)[:50]}'") + if history_grounded_search_used and not hybrid_search_enabled: + thought_tracker.add_thought( + 'search', + f"Searching {len(effective_selected_document_ids)} previously grounded document(s) for '{(search_query or user_message)[:50]}'" + ) + else: + thought_tracker.add_thought( + 'search', + f"Searching {effective_document_scope or 'personal'} workspace documents for '{(search_query or user_message)[:50]}'" + ) try: # Prepare search arguments # Set default and maximum values for top_n @@ -3878,25 +6255,31 @@ def result_requires_message_reload(result: Any) -> bool: "query": search_query, "user_id": user_id, "top_n": top_n, - "doc_scope": document_scope, + "doc_scope": effective_document_scope, } # Add active_group_ids when: # 1. Document scope is 'group' or chat_type is 'group', OR # 2. 
Document scope is 'all' and groups are enabled (so group search can be included) - if active_group_ids and (document_scope == 'group' or document_scope == 'all' or chat_type == 'group'): - search_args["active_group_ids"] = active_group_ids + if effective_active_group_ids and ( + effective_document_scope == 'group' + or effective_document_scope == 'all' + or chat_type == 'group' + ): + search_args["active_group_ids"] = effective_active_group_ids # Add active_public_workspace_id when: # 1. Document scope is 'public' or # 2. Document scope is 'all' and public workspaces are enabled - if active_public_workspace_id and (document_scope == 'public' or document_scope == 'all'): - search_args["active_public_workspace_id"] = active_public_workspace_id + if effective_active_public_workspace_id and ( + effective_document_scope == 'public' or effective_document_scope == 'all' + ): + search_args["active_public_workspace_id"] = effective_active_public_workspace_id - if selected_document_ids: - search_args["document_ids"] = selected_document_ids - elif selected_document_id: - search_args["document_id"] = selected_document_id + if effective_selected_document_ids: + search_args["document_ids"] = effective_selected_document_ids + elif effective_selected_document_id: + search_args["document_id"] = effective_selected_document_id # Add tags filter if provided if tags_filter and isinstance(tags_filter, list) and len(tags_filter) > 0: @@ -3930,10 +6313,18 @@ def result_requires_message_reload(result: Any) -> bool: chunk_sequence = doc.get('chunk_sequence', 0) # Add default page_number = doc.get('page_number') or chunk_sequence or 1 # Ensure a fallback page citation_id = doc.get('id', str(uuid.uuid4())) # Ensure ID exists + document_id = str(doc.get('document_id') or '').strip() + if not document_id: + document_id = ( + '_'.join(str(citation_id).split('_')[:-1]) + if '_' in str(citation_id) + else str(citation_id) + ) classification = doc.get('document_classification') chunk_id = 
doc.get('chunk_id', str(uuid.uuid4())) # Ensure ID exists score = doc.get('score', 0.0) # Add default score group_id = doc.get('group_id', None) # Add default group ID + doc_public_workspace_id = doc.get('public_workspace_id', None) sheet_name = doc.get('sheet_name') location_label, location_value = get_citation_location( file_name, @@ -3946,6 +6337,7 @@ def result_requires_message_reload(result: Any) -> bool: retrieved_texts.append(f"{chunk_text}\n{citation}") combined_documents.append({ "file_name": file_name, + "document_id": document_id, "citation_id": citation_id, "page_number": page_number, "sheet_name": sheet_name, @@ -3958,6 +6350,7 @@ def result_requires_message_reload(result: Any) -> bool: "chunk_id": chunk_id, "score": score, "group_id": group_id, + "public_workspace_id": doc_public_workspace_id, }) if classification: classifications_found.add(classification) @@ -3979,12 +6372,14 @@ def result_requires_message_reload(result: Any) -> bool: # in the citation itself, as it can be large. The citation points *to* the chunk. 
citation_data = { "file_name": source_doc.get("file_name"), + "document_id": source_doc.get("document_id"), "citation_id": source_doc.get("citation_id"), # Seems like a useful identifier "page_number": source_doc.get("page_number"), "chunk_id": source_doc.get("chunk_id"), # Specific chunk identifier "chunk_sequence": source_doc.get("chunk_sequence"), # Order within document/group "score": source_doc.get("score"), # Relevance score from search "group_id": source_doc.get("group_id"), # Grouping info if used + "public_workspace_id": source_doc.get("public_workspace_id"), "version": source_doc.get("version"), # Document version "classification": source_doc.get("classification") # Document classification # Add any other relevant metadata fields from source_doc here @@ -3992,8 +6387,7 @@ def result_requires_message_reload(result: Any) -> bool: # Using .get() provides None if a key is missing, preventing KeyErrors hybrid_citations_list.append(citation_data) - # Reorder hybrid citations list in descending order based on page_number - hybrid_citations_list.sort(key=lambda x: x.get('page_number', 0), reverse=True) + hybrid_citations_list.sort(key=_build_hybrid_citation_sort_key, reverse=True) # --- NEW: Extract metadata (keywords/abstract) for additional citations --- # Only if extract_metadata is enabled @@ -4006,7 +6400,10 @@ def result_requires_message_reload(result: Any) -> bool: for doc in search_results: # Get document ID (from the chunk's document reference) # AI Search chunks contain references to their parent document - doc_id = doc.get('id', '').split('_')[0] if doc.get('id') else None + doc_id = str(doc.get('document_id') or '').strip() + if not doc_id and doc.get('id'): + raw_doc_id = str(doc.get('id') or '').strip() + doc_id = '_'.join(raw_doc_id.split('_')[:-1]) if '_' in raw_doc_id else raw_doc_id # Skip if we've already processed this document if not doc_id or doc_id in processed_doc_ids: @@ -4043,6 +6440,7 @@ def result_requires_message_reload(result: Any) 
-> bool: keywords_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": keywords_citation_id, "page_number": "Metadata", # Special page identifier "chunk_id": keywords_citation_id, @@ -4076,6 +6474,7 @@ def result_requires_message_reload(result: Any) -> bool: abstract_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": abstract_citation_id, "page_number": "Metadata", # Special page identifier "chunk_id": abstract_citation_id, @@ -4119,6 +6518,7 @@ def result_requires_message_reload(result: Any) -> bool: vision_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": vision_citation_id, "page_number": "AI Vision", # Special page identifier "chunk_id": vision_citation_id, @@ -4152,18 +6552,23 @@ def result_requires_message_reload(result: Any) -> bool: if list(classifications_found) != conversation_item.get('classification', []): conversation_item['classification'] = list(classifications_found) # No need to upsert item here, will be updated later + elif history_grounded_search_used: + thought_tracker.add_thought( + 'search', + 'No matching excerpts were found in the previously grounded documents' + ) # Update message-level chat_type based on actual document usage for this message # This must happen after document search is completed so search_results is populated message_chat_type = None - if hybrid_search_enabled and search_results and len(search_results) > 0: + if (hybrid_search_enabled or history_grounded_search_used) and search_results and len(search_results) > 0: # Documents were actually used for this message - if document_scope == 'group': + if effective_document_scope == 'group': message_chat_type = 'group' - elif document_scope == 'public': + elif effective_document_scope == 'public': message_chat_type = 'public' else: - message_chat_type = 'personal_single_user' + message_chat_type = 'personal_single_user' else: # No documents used for this message - only model knowledge message_chat_type = 
'Model' @@ -4171,19 +6576,22 @@ def result_requires_message_reload(result: Any) -> bool: # Update the message-level chat_type in user_metadata user_metadata['chat_context']['chat_type'] = message_chat_type debug_print(f"Set message-level chat_type to: {message_chat_type}") - debug_print(f"hybrid_search_enabled: {hybrid_search_enabled}, search_results count: {len(search_results) if search_results else 0}") + debug_print( + f"hybrid_search_enabled: {hybrid_search_enabled}, history_grounded_search_used: {history_grounded_search_used}, " + f"search_results count: {len(search_results) if search_results else 0}" + ) # Add context-specific information based on message chat type - if message_chat_type == 'group' and active_group_id: - user_metadata['chat_context']['group_id'] = active_group_id + if message_chat_type == 'group' and effective_active_group_id: + user_metadata['chat_context']['group_id'] = effective_active_group_id # We may have already fetched this in workspace_search section if 'workspace_search' in user_metadata and user_metadata['workspace_search'].get('group_name'): user_metadata['chat_context']['group_name'] = user_metadata['workspace_search']['group_name'] debug_print(f"Chat context - using group_name from workspace_search: {user_metadata['workspace_search']['group_name']}") else: try: - debug_print(f"Chat context - looking up group for id: {active_group_id}") - group_doc = find_group_by_id(active_group_id) + debug_print(f"Chat context - looking up group for id: {effective_active_group_id}") + group_doc = find_group_by_id(effective_active_group_id) debug_print(f"Chat context group lookup result: {group_doc}") if group_doc and group_doc.get('name'): @@ -4191,7 +6599,7 @@ def result_requires_message_reload(result: Any) -> bool: user_metadata['chat_context']['group_name'] = group_title debug_print(f"Chat context - set group_name to: {group_title}") else: - debug_print(f"Chat context - no group found or no name for id: {active_group_id}") + 
debug_print(f"Chat context - no group found or no name for id: {effective_active_group_id}") user_metadata['chat_context']['group_name'] = None except Exception as e: @@ -4495,20 +6903,26 @@ def result_requires_message_reload(result: Any) -> bool: 'error': user_friendly_message }), status_code + workspace_tabular_file_contexts = [] workspace_tabular_files = set() - if hybrid_search_enabled and is_tabular_processing_enabled(settings): - workspace_tabular_files = collect_workspace_tabular_filenames( + if (hybrid_search_enabled or history_grounded_search_used) and is_tabular_processing_enabled(settings): + workspace_tabular_file_contexts = collect_workspace_tabular_file_contexts( combined_documents=combined_documents, - selected_document_ids=selected_document_ids, - selected_document_id=selected_document_id, - document_scope=document_scope, + selected_document_ids=effective_selected_document_ids, + selected_document_id=effective_selected_document_id, + document_scope=effective_document_scope, + active_group_id=effective_active_group_id, + active_public_workspace_id=effective_active_public_workspace_id, ) + workspace_tabular_files = { + file_context['file_name'] for file_context in workspace_tabular_file_contexts + } - if hybrid_search_enabled and workspace_tabular_files and is_tabular_processing_enabled(settings): + if (hybrid_search_enabled or history_grounded_search_used) and workspace_tabular_files and is_tabular_processing_enabled(settings): tabular_source_hint = determine_tabular_source_hint( - document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, + effective_document_scope, + active_group_id=effective_active_group_id, + active_public_workspace_id=effective_active_public_workspace_id, ) tabular_execution_mode = get_tabular_execution_mode(user_message) tabular_filenames_str = ", ".join(sorted(workspace_tabular_files)) @@ -4517,16 +6931,17 @@ def result_requires_message_reload(result: Any) -> bool: 
plugin_logger.get_invocations_for_conversation(user_id, conversation_id, limit=1000) ) - tabular_analysis = asyncio.run(run_tabular_sk_analysis( + tabular_analysis = asyncio.run(run_tabular_analysis_with_multi_file_support( user_question=user_message, tabular_filenames=workspace_tabular_files, + tabular_file_contexts=workspace_tabular_file_contexts, user_id=user_id, conversation_id=conversation_id, gpt_model=gpt_model, settings=settings, source_hint=tabular_source_hint, - group_id=active_group_id if tabular_source_hint == 'group' else None, - public_workspace_id=active_public_workspace_id if tabular_source_hint == 'public' else None, + group_id=effective_active_group_id if tabular_source_hint == 'group' else None, + public_workspace_id=effective_active_public_workspace_id if tabular_source_hint == 'public' else None, execution_mode=tabular_execution_mode, )) tabular_invocations = get_new_plugin_invocations( @@ -4596,6 +7011,8 @@ def result_requires_message_reload(result: Any) -> bool: # --------------------------------------------------------------------- conversation_history_for_api = [] summary_of_older = "" + history_debug_info = {} + final_api_source_refs = [] try: @@ -4605,66 +7022,18 @@ def result_requires_message_reload(result: Any) -> bool: all_messages = list(cosmos_messages_container.query_items( query=all_messages_query, parameters=params_all, partition_key=conversation_id, enable_cross_partition_query=True )) - all_messages = filter_assistant_artifact_items(all_messages) - - # Sort messages using threading logic - all_messages = sort_messages_by_thread(all_messages) - - total_messages = len(all_messages) - - # Determine which messages are "recent" and which are "older" - # `conversation_history_limit` includes the *current* user message - num_recent_messages = min(total_messages, conversation_history_limit) - num_older_messages = total_messages - num_recent_messages - - recent_messages = all_messages[-num_recent_messages:] # Last N messages - 
older_messages_to_summarize = all_messages[:num_older_messages] # Messages before the recent ones - - # Summarize older messages if needed and present - if enable_summarize_content_history_beyond_conversation_history_limit and older_messages_to_summarize: - debug_print(f"Summarizing {len(older_messages_to_summarize)} older messages for conversation {conversation_id}") - summary_prompt_older = ( - "Summarize the following conversation history concisely (around 50-100 words), " - "focusing on key facts, decisions, or context that might be relevant for future turns. " - "Do not add any introductory phrases like 'Here is a summary'.\n\n" - "Conversation History:\n" - ) - message_texts_older = [] - for msg in older_messages_to_summarize: - role = msg.get('role', 'user') - metadata = msg.get('metadata', {}) - - # Check active_thread flag - skip messages with active_thread=False - thread_info = metadata.get('thread_info', {}) - active_thread = thread_info.get('active_thread') - - # Exclude content when active_thread is explicitly False - if active_thread is False: - debug_print(f"[THREAD] Skipping inactive thread message {msg.get('id')} from summary") - continue - - # Skip roles that shouldn't be in summary (adjust as needed) - if role in ['system', 'safety', 'blocked', 'image', 'file']: continue - content = msg.get('content', '') - message_texts_older.append(f"{role.upper()}: {content}") - - if message_texts_older: # Only summarize if there's content to summarize - summary_prompt_older += "\n".join(message_texts_older) - try: - # Use the already initialized client and model - summary_response_older = gpt_client.chat.completions.create( - model=gpt_model, - messages=[{"role": "system", "content": summary_prompt_older}], - max_tokens=150, # Adjust token limit for summary - temperature=0.3 # Lower temp for factual summary - ) - summary_of_older = summary_response_older.choices[0].message.content.strip() - debug_print(f"Generated summary: {summary_of_older}") - except 
Exception as e: - debug_print(f"Error summarizing older conversation history: {e}") - summary_of_older = "" # Failed, proceed without summary - else: - debug_print("No summarizable content found in older messages.") + history_segments = build_conversation_history_segments( + all_messages=all_messages, + conversation_history_limit=conversation_history_limit, + enable_summarize_older_messages=enable_summarize_content_history_beyond_conversation_history_limit, + gpt_client=gpt_client, + gpt_model=gpt_model, + user_message_id=user_message_id, + fallback_user_message=user_message, + ) + summary_of_older = history_segments['summary_of_older'] + chat_tabular_files = history_segments['chat_tabular_files'] + history_debug_info = history_segments.get('debug_info', {}) # Construct the final history for the API call @@ -4674,6 +7043,7 @@ def result_requires_message_reload(result: Any) -> bool: "role": "system", "content": f"\n{summary_of_older}\n" }) + final_api_source_refs.append('system:summary_of_older') # Add augmentation system messages (search, agents) next # **Important**: Decide if you want these saved. If so, you need to upsert them now. 
@@ -4721,6 +7091,7 @@ def result_requires_message_reload(result: Any) -> bool: } cosmos_messages_container.upsert_item(system_doc) conversation_history_for_api.append(aug_msg) # Add to API context + final_api_source_refs.append(f"system:augmentation:{len(final_api_source_refs) + 1}") # System message shares the same thread as user message, no thread update needed # --- NEW: Save plugin output as agent citation --- @@ -4731,141 +7102,8 @@ def result_requires_message_reload(result: Any) -> bool: "timestamp": datetime.utcnow().isoformat() }) - - # Add the recent messages (user, assistant, relevant system/file messages) - allowed_roles_in_history = ['user', 'assistant'] # Add 'system' if you PERSIST general system messages not related to augmentation - max_file_content_length_in_history = 50000 # Increased limit for all file content in history - max_tabular_content_length_in_history = 50000 # Same limit for tabular data consistency - chat_tabular_files = set() # Track tabular files uploaded directly to chat - - for message in recent_messages: - role = message.get('role') - content = message.get('content') - metadata = message.get('metadata', {}) - - # Check active_thread flag - skip messages with active_thread=False - # This handles both threaded messages and legacy messages with the flag set - thread_info = metadata.get('thread_info', {}) - active_thread = thread_info.get('active_thread') - - # Exclude content when active_thread is explicitly False - # Include when: active_thread is True, None, or not present (legacy messages) - if active_thread is False: - debug_print(f"[THREAD] Skipping inactive thread message {message.get('id')} (thread_id: {thread_info.get('thread_id')}, attempt: {thread_info.get('thread_attempt')})") - continue - - # Check if message is fully masked - skip it entirely - if metadata.get('masked', False): - debug_print(f"[MASK] Skipping fully masked message {message.get('id')}") - continue - - # Check for partially masked content - masked_ranges = 
metadata.get('masked_ranges', []) - if masked_ranges and content: - # Remove masked portions from content - content = remove_masked_content(content, masked_ranges) - debug_print(f"[MASK] Applied {len(masked_ranges)} masked ranges to message {message.get('id')}") - - if role in allowed_roles_in_history: - conversation_history_for_api.append({"role": role, "content": content}) - elif role == 'file': # Handle file content inclusion (simplified) - filename = message.get('filename', 'uploaded_file') - file_content = message.get('file_content', '') # Assuming file content is stored - is_table = message.get('is_table', False) - file_content_source = message.get('file_content_source', '') - - # Tabular files stored in blob (enhanced citations enabled) - reference plugin - if is_table and file_content_source == 'blob': - chat_tabular_files.add(filename) # Track for mini SK analysis - conversation_history_for_api.append({ - 'role': 'system', - 'content': f"[User uploaded a tabular data file named '{filename}'. " - f"The file is stored in blob storage and available for analysis. " - f"Use the tabular_processing plugin functions (list_tabular_files, describe_tabular_file, " - f"aggregate_column, filter_rows, query_tabular_data, group_by_aggregate, group_by_datetime_component) to analyze this data. " - f"The file source is 'chat'.]" - }) - else: - # Use higher limit for tabular data that needs complete analysis - content_limit = max_tabular_content_length_in_history if is_table else max_file_content_length_in_history - - display_content = file_content[:content_limit] - if len(file_content) > content_limit: - display_content += "..." - - # Enhanced message for tabular data - if is_table: - conversation_history_for_api.append({ - 'role': 'system', # Represent file as system info - 'content': f"[User uploaded a tabular data file named '{filename}'. This is CSV format data for analysis:\n{display_content}]\nThis is complete tabular data in CSV format. 
You can perform calculations, analysis, and data operations on this dataset." - }) - else: - conversation_history_for_api.append({ - 'role': 'system', # Represent file as system info - 'content': f"[User uploaded a file named '{filename}'. Content preview:\n{display_content}]\nUse this file context if relevant." - }) - elif role == 'image': # Handle image uploads with extracted text and vision analysis - filename = message.get('filename', 'uploaded_image') - is_user_upload = message.get('metadata', {}).get('is_user_upload', False) - - if is_user_upload: - # This is a user-uploaded image with extracted text and vision analysis - # IMPORTANT: Do NOT include message.get('content') as it contains base64 image data - # which would consume excessive tokens. Only use extracted_text and vision_analysis. - extracted_text = message.get('extracted_text', '') - vision_analysis = message.get('vision_analysis', {}) - - # Build comprehensive context from OCR and vision analysis (NO BASE64!) - image_context_parts = [f"[User uploaded an image named '{filename}'.]"] - - if extracted_text: - # Include OCR text from Document Intelligence - extracted_preview = extracted_text[:max_file_content_length_in_history] - if len(extracted_text) > max_file_content_length_in_history: - extracted_preview += "..." 
- image_context_parts.append(f"\n\nExtracted Text (OCR):\n{extracted_preview}") - - if vision_analysis: - # Include AI vision analysis - image_context_parts.append("\n\nAI Vision Analysis:") - - if vision_analysis.get('description'): - image_context_parts.append(f"\nDescription: {vision_analysis['description']}") - - if vision_analysis.get('objects'): - objects_str = ', '.join(vision_analysis['objects']) - image_context_parts.append(f"\nObjects detected: {objects_str}") - - if vision_analysis.get('text'): - image_context_parts.append(f"\nText visible in image: {vision_analysis['text']}") - - if vision_analysis.get('contextual_analysis'): - image_context_parts.append(f"\nContextual analysis: {vision_analysis['contextual_analysis']}") - - image_context_content = ''.join(image_context_parts) + "\n\nUse this image information to answer questions about the uploaded image." - - # Verify we're not accidentally including base64 data - if 'data:image/' in image_context_content or ';base64,' in image_context_content: - debug_print(f"WARNING: Base64 image data detected in chat history for {filename}! Removing to save tokens.") - # This should never happen, but safety check just in case - image_context_content = f"[User uploaded an image named '{filename}' - image data excluded from chat history to conserve tokens]" - - debug_print(f"[IMAGE_CONTEXT] Adding user-uploaded image to history: {filename}, context length: {len(image_context_content)} chars") - conversation_history_for_api.append({ - 'role': 'system', - 'content': image_context_content - }) - else: - # This is a system-generated image (DALL-E, etc.) 
- # Don't include the image data URL in history either - prompt = message.get('prompt', 'User requested image generation.') - debug_print(f"[IMAGE_CONTEXT] Adding system-generated image to history: {prompt[:100]}...") - conversation_history_for_api.append({ - 'role': 'system', - 'content': f"[Assistant generated an image based on the prompt: '{prompt}']" - }) - - # Ignored roles: 'safety', 'blocked', 'system' (if they are only for augmentation/summary) + conversation_history_for_api.extend(history_segments['history_messages']) + final_api_source_refs.extend(history_debug_info.get('history_message_source_refs', [])) # --- Mini SK analysis for tabular files uploaded directly to chat --- if chat_tabular_files and is_tabular_processing_enabled(settings): @@ -4880,7 +7118,7 @@ def result_requires_message_reload(result: Any) -> bool: plugin_logger.get_invocations_for_conversation(user_id, conversation_id, limit=1000) ) - chat_tabular_analysis = asyncio.run(run_tabular_sk_analysis( + chat_tabular_analysis = asyncio.run(run_tabular_analysis_with_multi_file_support( user_question=user_message, tabular_filenames=chat_tabular_files, user_id=user_id, @@ -4913,6 +7151,7 @@ def result_requires_message_reload(result: Any) -> bool: chat_tabular_analysis, ) }) + final_api_source_refs.append('system:tabular_results') # Collect tool execution citations from SK tabular analysis chat_tabular_sk_citations = collect_tabular_sk_citations(user_id, conversation_id) @@ -4928,20 +7167,6 @@ def result_requires_message_reload(result: Any) -> bool: ) debug_print("[Chat Tabular SK] Analysis returned None, relying on existing file context messages") - # Ensure the very last message is the current user's message (it should be if fetched correctly) - if not conversation_history_for_api or conversation_history_for_api[-1]['role'] != 'user': - debug_print("Warning: Last message in history is not the user's current message. 
Appending.") - # This might happen if 'recent_messages' somehow didn't include the latest user message saved in step 2 - # Or if the last message had an ignored role. Find the actual user message: - user_msg_found = False - for msg in reversed(recent_messages): - if msg['role'] == 'user' and msg['id'] == user_message_id: - conversation_history_for_api.append({"role": "user", "content": msg['content']}) - user_msg_found = True - break - if not user_msg_found: # Still not found? Append the original input as fallback - conversation_history_for_api.append({"role": "user", "content": user_message}) - except Exception as e: debug_print(f"Error preparing conversation history: {e}") return jsonify({'error': f'Error preparing conversation history: {str(e)}'}), 500 @@ -4951,6 +7176,7 @@ def result_requires_message_reload(result: Any) -> bool: # 6) Final GPT Call # --------------------------------------------------------------------- default_system_prompt = settings.get('default_system_prompt', '').strip() + default_system_prompt_inserted = False # Only add if non-empty and not already present (excluding summary/augmentation system messages) if default_system_prompt: # Find if any system message (not summary or augmentation) is present @@ -4970,6 +7196,43 @@ def result_requires_message_reload(result: Any) -> bool: "role": "system", "content": default_system_prompt }) + final_api_source_refs.insert(insert_idx, 'system:default_prompt') + default_system_prompt_inserted = True + + if not original_hybrid_search_enabled: + history_grounding_message = build_history_grounding_system_message() + insert_idx = 0 + if ( + conversation_history_for_api + and conversation_history_for_api[0].get('role') == 'system' + and conversation_history_for_api[0].get('content', '').startswith( + '' + ) + ): + insert_idx = 1 + if default_system_prompt_inserted: + insert_idx += 1 + conversation_history_for_api.insert(insert_idx, history_grounding_message) + final_api_source_refs.insert(insert_idx, 
'system:history_grounding') + + history_debug_info = enrich_history_context_debug_info( + history_debug_info, + conversation_history_for_api, + final_api_source_refs, + path_label='standard', + augmentation_message_count=len(system_messages_for_augmentation), + default_system_prompt_inserted=default_system_prompt_inserted, + ) + emit_history_context_debug(history_debug_info, conversation_id) + thought_tracker.add_thought( + 'history_context', + build_history_context_thought_content(history_debug_info), + build_history_context_thought_detail(history_debug_info), + ) + if settings.get('enable_debug_logging', False): + agent_citations_list.append( + build_history_context_debug_citation(history_debug_info, 'standard') + ) # --- DRY Fallback Chain Helper --- def try_fallback_chain(steps): @@ -4996,38 +7259,6 @@ def try_fallback_chain(steps): # If all fail, return default error return ("Sorry, I encountered an error.", gpt_model, None, None) - # --- Inject facts as a system message at the top of conversation_history_for_api --- - def get_facts_for_context(scope_id, scope_type, conversation_id: str = None, agent_id: str = None): - settings = get_settings() - agents = settings.get('semantic_kernel_agents', []) - default_agent = next((a for a in agents if a.get('default_agent')), None) - agent_dict = default_agent or (agents[0] if agents else None) - agent_id = agent_dict.get('id') if agent_dict else None - if not scope_id or not scope_type: - return "" - fact_store = FactMemoryStore() - kwargs = dict( - scope_type=scope_type, - scope_id=scope_id, - ) - if agent_id: - kwargs['agent_id'] = agent_id - if conversation_id: - kwargs['conversation_id'] = conversation_id - facts = fact_store.get_facts(**kwargs) - if not facts: - return "" - fact_lines = [] - for fact in facts: - value = fact.get('value', '') - if value: - fact_lines.append(f"- {value}") - fact_lines.append(f"- agent_id: {agent_id}") - fact_lines.append(f"- scope_type: {scope_type}") - fact_lines.append(f"- 
scope_id: {scope_id}") - fact_lines.append(f"- conversation_id: {conversation_id}") - return "\n".join(fact_lines) - async def run_sk_call(callable_obj, *args, **kwargs): log_event( f"Running Semantic Kernel callable: {callable_obj.__name__}", @@ -5225,19 +7456,13 @@ async def run_sk_call(callable_obj, *args, **kwargs): # Add additional metadata here to scope the facts to be returned # Allows for additional per agent and per conversation scoping. - facts = get_facts_for_context( + inject_fact_memory_context( + conversation_history=conversation_history_for_api, scope_id=scope_id, - scope_type=scope_type + scope_type=scope_type, + conversation_id=conversation_id, + agent_id=agent_id, ) - if facts: - conversation_history_for_api.insert(0, { - "role": "system", - "content": f"\n{facts}\n" - }) - conversation_history_for_api.insert(0, { - "role": "system", - "content": f"""\n\n\n\n\n""" - }) agent_message_history = [ ChatMessageContent( @@ -5791,7 +8016,7 @@ def gpt_error(e): 'augmented': bool(system_messages_for_augmentation), 'hybrid_citations': hybrid_citations_list, # <--- SIMPLIFIED: Directly use the list 'web_search_citations': web_search_citations_list, - 'hybridsearch_query': search_query if hybrid_search_enabled and search_results else None, # Log query only if hybrid search ran and found results + 'hybridsearch_query': search_query if search_results else None, # Log query when any bounded document retrieval produced results 'agent_citations': prepared_agent_citations, 'model_deployment_name': actual_model_used, 'agent_display_name': agent_display_name, @@ -5799,6 +8024,7 @@ def gpt_error(e): 'metadata': { 'user_info': user_info_for_assistant, # Track which user created this assistant message 'reasoning_effort': reasoning_effort, + 'history_context': history_debug_info, 'thread_info': { 'thread_id': user_thread_id, # Same thread as user message 'previous_thread_id': user_previous_thread_id, # Same previous_thread_id as user message @@ -5824,9 +8050,9 @@ def 
gpt_error(e): # Determine workspace type based on active group/public workspace workspace_type = 'personal' - if active_public_workspace_id: + if effective_active_public_workspace_id: workspace_type = 'public' - elif active_group_id: + elif effective_active_group_id: workspace_type = 'group' log_token_usage( @@ -5839,8 +8065,8 @@ def gpt_error(e): completion_tokens=token_usage_data.get('completion_tokens'), conversation_id=conversation_id, message_id=assistant_message_id, - group_id=active_group_id, - public_workspace_id=active_public_workspace_id, + group_id=effective_active_group_id, + public_workspace_id=effective_active_public_workspace_id, additional_context={ 'agent_name': agent_name, 'augmented': bool(system_messages_for_augmentation), @@ -5882,20 +8108,20 @@ def gpt_error(e): user_message=user_message, conversation_id=conversation_id, user_id=user_id, - active_group_id=active_group_id, - active_group_ids=active_group_ids, - document_scope=document_scope, - selected_document_id=selected_document_id, + active_group_id=effective_active_group_id, + active_group_ids=effective_active_group_ids, + document_scope=effective_document_scope, + selected_document_id=effective_selected_document_id, model_deployment=actual_model_used, - hybrid_search_enabled=hybrid_search_enabled, + hybrid_search_enabled=hybrid_search_enabled or history_grounded_search_used, image_gen_enabled=image_gen_enabled, selected_documents=combined_documents if 'combined_documents' in locals() else None, selected_agent=selected_agent_name, selected_agent_details=user_metadata.get('agent_selection'), search_results=search_results if 'search_results' in locals() else None, conversation_item=conversation_item, - active_public_workspace_id=active_public_workspace_id, - active_public_workspace_ids=active_public_workspace_ids + active_public_workspace_id=effective_active_public_workspace_id, + active_public_workspace_ids=effective_active_public_workspace_ids ) except Exception as e: debug_print(f"Error 
collecting conversation metadata: {e}") @@ -6224,6 +8450,8 @@ def generate(publish_background_event=None): # Validate chat_type if chat_type not in ('user', 'group'): chat_type = 'user' + scope_id = active_group_id if chat_type == 'group' else user_id + scope_type = 'group' if chat_type == 'group' else 'user' # Initialize variables search_query = user_message @@ -6239,12 +8467,27 @@ def generate(publish_background_event=None): conversation_history_limit = math.ceil(raw_conversation_history_limit) if conversation_history_limit % 2 != 0: conversation_history_limit += 1 + enable_summarize_content_history_beyond_conversation_history_limit = settings.get( + 'enable_summarize_content_history_beyond_conversation_history_limit', + True, + ) # Convert toggles if isinstance(hybrid_search_enabled, str): hybrid_search_enabled = hybrid_search_enabled.lower() == 'true' if isinstance(web_search_enabled, str): web_search_enabled = web_search_enabled.lower() == 'true' + original_hybrid_search_enabled = bool(hybrid_search_enabled) + history_grounded_search_used = False + history_only_answerability = None + prior_grounded_document_refs = [] + effective_document_scope = document_scope + effective_selected_document_ids = list(selected_document_ids or []) + effective_selected_document_id = selected_document_id + effective_active_group_ids = list(active_group_ids or []) + effective_active_group_id = active_group_id + effective_active_public_workspace_ids = list(active_public_workspace_ids or []) + effective_active_public_workspace_id = active_public_workspace_id debug_print( "[Streaming] Normalized toggles | " f"hybrid_search={hybrid_search_enabled} | " @@ -6709,36 +8952,162 @@ def publish_live_plugin_thought(thought_payload): except Exception as ex: debug_print(f"[Content Safety - Streaming] Unexpected error: {ex}") + if not original_hybrid_search_enabled: + prior_grounded_document_refs = _normalize_prior_grounded_document_refs(conversation_item) + if prior_grounded_document_refs: + 
yield emit_thought( + 'history_context', + 'Checking whether prior conversation context already answers the question', + detail=f"grounded_documents={len(prior_grounded_document_refs)}" + ) + try: + preflight_messages_query = ( + "SELECT * FROM c WHERE c.conversation_id = @conv_id ORDER BY c.timestamp ASC" + ) + preflight_messages_params = [{"name": "@conv_id", "value": conversation_id}] + preflight_messages = list(cosmos_messages_container.query_items( + query=preflight_messages_query, + parameters=preflight_messages_params, + partition_key=conversation_id, + enable_cross_partition_query=True, + )) + preflight_history_segments = build_conversation_history_segments( + all_messages=preflight_messages, + conversation_history_limit=conversation_history_limit, + enable_summarize_older_messages=enable_summarize_content_history_beyond_conversation_history_limit, + gpt_client=gpt_client, + gpt_model=gpt_model, + user_message_id=user_message_id, + fallback_user_message=user_message, + ) + history_only_answerability = assess_history_only_answerability( + gpt_client, + gpt_model, + build_history_only_assessment_messages( + preflight_history_segments, + settings.get('default_system_prompt', '').strip(), + ), + ) + except Exception as assessment_error: + debug_print( + f"[Streaming][History Fallback] History-only sufficiency assessment failed: {assessment_error}" + ) + + if history_only_answerability and history_only_answerability.get('can_answer_from_history'): + yield emit_thought( + 'history_context', + 'Prior conversation context appears sufficient without new document retrieval', + detail=history_only_answerability.get('reason') or None, + ) + else: + fallback_search_parameters = build_prior_grounded_document_search_parameters( + prior_grounded_document_refs + ) + if fallback_search_parameters.get('document_ids'): + history_grounded_search_used = True + effective_document_scope = fallback_search_parameters.get('doc_scope') or 'all' + effective_selected_document_ids = 
list( + fallback_search_parameters.get('document_ids') or [] + ) + effective_selected_document_id = ( + effective_selected_document_ids[0] + if len(effective_selected_document_ids) == 1 + else None + ) + effective_active_group_ids = list( + fallback_search_parameters.get('active_group_ids') or [] + ) + effective_active_group_id = fallback_search_parameters.get('active_group_id') + effective_active_public_workspace_ids = list( + fallback_search_parameters.get('active_public_workspace_ids') or [] + ) + effective_active_public_workspace_id = fallback_search_parameters.get( + 'active_public_workspace_id' + ) + + rewritten_search_query = '' + if history_only_answerability: + rewritten_search_query = str( + history_only_answerability.get('search_query') or '' + ).strip() + if rewritten_search_query: + search_query = rewritten_search_query + + fallback_detail_parts = [ + f"documents={len(effective_selected_document_ids)}", + f"scope={effective_document_scope or 'all'}", + ] + if history_only_answerability and history_only_answerability.get('reason'): + fallback_detail_parts.append( + f"reason={history_only_answerability['reason']}" + ) + yield emit_thought( + 'search', + 'Conversation context alone was insufficient; searching previously grounded documents', + detail=' | '.join(fallback_detail_parts), + ) + + user_metadata.setdefault('workspace_search', {})[ + 'history_grounded_fallback' + ] = { + 'used': True, + 'document_scope': effective_document_scope, + 'document_count': len(effective_selected_document_ids), + 'search_query': search_query, + } + user_message_doc['metadata'] = user_metadata + cosmos_messages_container.upsert_item(user_message_doc) + else: + yield emit_thought( + 'history_context', + 'No prior grounded documents were available; using conversation history only' + ) + # Hybrid search (if enabled) combined_documents = [] - if hybrid_search_enabled: + if hybrid_search_enabled or history_grounded_search_used: debug_print( "[Streaming] Starting hybrid search 
| " - f"conversation_id={conversation_id} | doc_scope={document_scope} | " - f"selected_document_ids={len(selected_document_ids)} | tags={len(tags_filter) if isinstance(tags_filter, list) else 0}" + f"conversation_id={conversation_id} | doc_scope={effective_document_scope} | " + f"selected_document_ids={len(effective_selected_document_ids)} | tags={len(tags_filter) if isinstance(tags_filter, list) else 0}" ) - yield emit_thought('search', f"Searching {document_scope or 'personal'} workspace documents for '{(search_query or user_message)[:50]}'") + if history_grounded_search_used and not hybrid_search_enabled: + yield emit_thought( + 'search', + f"Searching {len(effective_selected_document_ids)} previously grounded document(s) for '{(search_query or user_message)[:50]}'" + ) + else: + yield emit_thought( + 'search', + f"Searching {effective_document_scope or 'personal'} workspace documents for '{(search_query or user_message)[:50]}'" + ) try: search_args = { "query": search_query, "user_id": user_id, "top_n": 12, - "doc_scope": document_scope, + "doc_scope": effective_document_scope, } - if active_group_ids and (document_scope == 'group' or document_scope == 'all' or chat_type == 'group'): - search_args['active_group_ids'] = active_group_ids + if effective_active_group_ids and ( + effective_document_scope == 'group' + or effective_document_scope == 'all' + or chat_type == 'group' + ): + search_args['active_group_ids'] = effective_active_group_ids # Add active_public_workspace_id when: # 1. Document scope is 'public' or # 2. 
Document scope is 'all' and public workspaces are enabled - if active_public_workspace_id and (document_scope == 'public' or document_scope == 'all'): - search_args['active_public_workspace_id'] = active_public_workspace_id + if effective_active_public_workspace_id and ( + effective_document_scope == 'public' or effective_document_scope == 'all' + ): + search_args['active_public_workspace_id'] = effective_active_public_workspace_id - if selected_document_ids: - search_args['document_ids'] = selected_document_ids - elif selected_document_id: - search_args['document_id'] = selected_document_id + if effective_selected_document_ids: + search_args['document_ids'] = effective_selected_document_ids + elif effective_selected_document_id: + search_args['document_id'] = effective_selected_document_id # Add tags filter if provided if tags_filter and isinstance(tags_filter, list) and len(tags_filter) > 0: @@ -6763,10 +9132,18 @@ def publish_live_plugin_thought(thought_payload): chunk_sequence = doc.get('chunk_sequence', 0) page_number = doc.get('page_number') or chunk_sequence or 1 citation_id = doc.get('id', str(uuid.uuid4())) + document_id = str(doc.get('document_id') or '').strip() + if not document_id: + document_id = ( + '_'.join(str(citation_id).split('_')[:-1]) + if '_' in str(citation_id) + else str(citation_id) + ) classification = doc.get('document_classification') chunk_id = doc.get('chunk_id', str(uuid.uuid4())) score = doc.get('score', 0.0) group_id = doc.get('group_id', None) + doc_public_workspace_id = doc.get('public_workspace_id', None) sheet_name = doc.get('sheet_name') location_label, location_value = get_citation_location( file_name, @@ -6780,6 +9157,7 @@ def publish_live_plugin_thought(thought_payload): combined_documents.append({ "file_name": file_name, + "document_id": document_id, "citation_id": citation_id, "page_number": page_number, "sheet_name": sheet_name, @@ -6792,17 +9170,20 @@ def publish_live_plugin_thought(thought_payload): "chunk_id": 
chunk_id, "score": score, "group_id": group_id, + "public_workspace_id": doc_public_workspace_id, }) # Build citation data to match non-streaming format citation_data = { "file_name": file_name, + "document_id": document_id, "citation_id": citation_id, "page_number": page_number, "chunk_id": chunk_id, "chunk_sequence": chunk_sequence, "score": score, "group_id": group_id, + "public_workspace_id": doc_public_workspace_id, "version": version, "classification": classification } @@ -6815,7 +9196,10 @@ def publish_live_plugin_thought(thought_payload): processed_doc_ids = set() for doc in search_results: - doc_id = doc.get('document_id') or doc.get('id') + doc_id = str(doc.get('document_id') or '').strip() + if not doc_id and doc.get('id'): + raw_doc_id = str(doc.get('id') or '').strip() + doc_id = '_'.join(raw_doc_id.split('_')[:-1]) if '_' in raw_doc_id else raw_doc_id if not doc_id or doc_id in processed_doc_ids: continue @@ -6826,10 +9210,10 @@ def publish_live_plugin_thought(thought_payload): # Map document_scope to correct parameter names for the function metadata_params = {'user_id': user_id} - if document_scope == 'group': - metadata_params['group_id'] = active_group_id - elif document_scope == 'public': - metadata_params['public_workspace_id'] = active_public_workspace_id + if effective_document_scope == 'group': + metadata_params['group_id'] = effective_active_group_id + elif effective_document_scope == 'public': + metadata_params['public_workspace_id'] = effective_active_public_workspace_id metadata = get_document_metadata_for_citations( doc_id, @@ -6846,6 +9230,7 @@ def publish_live_plugin_thought(thought_payload): keywords_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": keywords_citation_id, "page_number": "Metadata", "chunk_id": keywords_citation_id, @@ -6868,6 +9253,7 @@ def publish_live_plugin_thought(thought_payload): abstract_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": abstract_citation_id, 
"page_number": "Metadata", "chunk_id": abstract_citation_id, @@ -6903,6 +9289,7 @@ def publish_live_plugin_thought(thought_payload): vision_citation = { "file_name": file_name, + "document_id": doc_id, "citation_id": vision_citation_id, "page_number": "AI Vision", "chunk_id": vision_citation_id, @@ -6929,23 +9316,33 @@ def publish_live_plugin_thought(thought_payload): 'documents': combined_documents }) - # Reorder hybrid citations list in descending order based on page_number - hybrid_citations_list.sort(key=lambda x: x.get('page_number', 0), reverse=True) + hybrid_citations_list.sort(key=_build_hybrid_citation_sort_key, reverse=True) + elif history_grounded_search_used: + yield emit_thought( + 'search', + 'No matching excerpts were found in the previously grounded documents' + ) + workspace_tabular_file_contexts = [] workspace_tabular_files = set() - if hybrid_search_enabled and is_tabular_processing_enabled(settings): - workspace_tabular_files = collect_workspace_tabular_filenames( + if (hybrid_search_enabled or history_grounded_search_used) and is_tabular_processing_enabled(settings): + workspace_tabular_file_contexts = collect_workspace_tabular_file_contexts( combined_documents=combined_documents, - selected_document_ids=selected_document_ids, - selected_document_id=selected_document_id, - document_scope=document_scope, + selected_document_ids=effective_selected_document_ids, + selected_document_id=effective_selected_document_id, + document_scope=effective_document_scope, + active_group_id=effective_active_group_id, + active_public_workspace_id=effective_active_public_workspace_id, ) + workspace_tabular_files = { + file_context['file_name'] for file_context in workspace_tabular_file_contexts + } - if hybrid_search_enabled and workspace_tabular_files and is_tabular_processing_enabled(settings): + if (hybrid_search_enabled or history_grounded_search_used) and workspace_tabular_files and is_tabular_processing_enabled(settings): tabular_source_hint = 
determine_tabular_source_hint( - document_scope, - active_group_id=active_group_id, - active_public_workspace_id=active_public_workspace_id, + effective_document_scope, + active_group_id=effective_active_group_id, + active_public_workspace_id=effective_active_public_workspace_id, ) tabular_execution_mode = get_tabular_execution_mode(user_message) tabular_filenames_str = ", ".join(sorted(workspace_tabular_files)) @@ -6956,19 +9353,21 @@ def publish_live_plugin_thought(thought_payload): debug_print( "[Streaming][Tabular SK] Starting workspace tabular analysis | " f"files={sorted(workspace_tabular_files)} | source_hint={tabular_source_hint} | " + f"file_contexts={workspace_tabular_file_contexts} | " f"execution_mode={tabular_execution_mode} | baseline_invocations={baseline_tabular_invocation_count}" ) - tabular_analysis = asyncio.run(run_tabular_sk_analysis( + tabular_analysis = asyncio.run(run_tabular_analysis_with_multi_file_support( user_question=user_message, tabular_filenames=workspace_tabular_files, + tabular_file_contexts=workspace_tabular_file_contexts, user_id=user_id, conversation_id=conversation_id, gpt_model=gpt_model, settings=settings, source_hint=tabular_source_hint, - group_id=active_group_id if tabular_source_hint == 'group' else None, - public_workspace_id=active_public_workspace_id if tabular_source_hint == 'public' else None, + group_id=effective_active_group_id if tabular_source_hint == 'group' else None, + public_workspace_id=effective_active_public_workspace_id if tabular_source_hint == 'public' else None, execution_mode=tabular_execution_mode, )) tabular_invocations = get_new_plugin_invocations( @@ -7044,10 +9443,10 @@ def publish_live_plugin_thought(thought_payload): # Update message chat type message_chat_type = None - if hybrid_search_enabled and search_results and len(search_results) > 0: - if document_scope == 'group': + if (hybrid_search_enabled or history_grounded_search_used) and search_results and len(search_results) > 0: + if 
effective_document_scope == 'group': message_chat_type = 'group' - elif document_scope == 'public': + elif effective_document_scope == 'public': message_chat_type = 'public' else: message_chat_type = 'personal_single_user' @@ -7060,6 +9459,8 @@ def publish_live_plugin_thought(thought_payload): # Prepare conversation history conversation_history_for_api = [] + history_debug_info = {} + final_api_source_refs = [] try: all_messages_query = "SELECT * FROM c WHERE c.conversation_id = @conv_id ORDER BY c.timestamp ASC" @@ -7068,85 +9469,38 @@ def publish_live_plugin_thought(thought_payload): query=all_messages_query, parameters=params_all, partition_key=conversation_id, enable_cross_partition_query=True )) - all_messages = filter_assistant_artifact_items(all_messages) - - # Sort messages using threading logic - all_messages = sort_messages_by_thread(all_messages) - - total_messages = len(all_messages) - num_recent_messages = min(total_messages, conversation_history_limit) - recent_messages = all_messages[-num_recent_messages:] - - # Add augmentation messages - for aug_msg in system_messages_for_augmentation: - conversation_history_for_api.append({ - 'role': aug_msg['role'], - 'content': aug_msg['content'] - }) - - # Add recent messages (with file role handling) - allowed_roles_in_history = ['user', 'assistant'] - max_file_content_length_in_history = 50000 - max_tabular_content_length_in_history = 50000 - chat_tabular_files = set() # Track tabular files uploaded directly to chat - - for message in recent_messages: - role = message.get('role') - content = message.get('content', '') + history_segments = build_conversation_history_segments( + all_messages=all_messages, + conversation_history_limit=conversation_history_limit, + enable_summarize_older_messages=enable_summarize_content_history_beyond_conversation_history_limit, + gpt_client=gpt_client, + gpt_model=gpt_model, + user_message_id=user_message_id, + fallback_user_message=user_message, + ) + summary_of_older = 
history_segments['summary_of_older'] + chat_tabular_files = history_segments['chat_tabular_files'] + history_debug_info = history_segments.get('debug_info', {}) - if role in allowed_roles_in_history: - conversation_history_for_api.append({ - 'role': role, - 'content': content - }) - elif role == 'file': - filename = message.get('filename', 'uploaded_file') - file_content = message.get('file_content', '') - is_table = message.get('is_table', False) - file_content_source = message.get('file_content_source', '') - - # Tabular files stored in blob - track for mini SK analysis - if is_table and file_content_source == 'blob': - chat_tabular_files.add(filename) - conversation_history_for_api.append({ - 'role': 'system', - 'content': ( - f"[User uploaded a tabular data file named '{filename}'. " - f"The file is stored in blob storage and available for analysis. " - f"Use the tabular_processing plugin functions (list_tabular_files, " - f"describe_tabular_file, aggregate_column, filter_rows, " - f"query_tabular_data, group_by_aggregate, group_by_datetime_component) to analyze this data. " - f"The file source is 'chat'.]" - ) - }) - else: - content_limit = ( - max_tabular_content_length_in_history if is_table - else max_file_content_length_in_history - ) - display_content = file_content[:content_limit] - if len(file_content) > content_limit: - display_content += "..." - - if is_table: - conversation_history_for_api.append({ - 'role': 'system', - 'content': ( - f"[User uploaded a tabular data file named '{filename}'. " - f"This is CSV format data for analysis:\n{display_content}]\n" - f"This is complete tabular data in CSV format. You can perform " - f"calculations, analysis, and data operations on this dataset." - ) - }) - else: - conversation_history_for_api.append({ - 'role': 'system', - 'content': ( - f"[User uploaded a file named '{filename}'. " - f"Content preview:\n{display_content}]\n" - f"Use this file context if relevant." 
- ) - }) + if summary_of_older: + conversation_history_for_api.append({ + 'role': 'system', + 'content': ( + f"\n{summary_of_older}\n" + "" + ) + }) + final_api_source_refs.append('system:summary_of_older') + + # Add augmentation messages + for aug_msg in system_messages_for_augmentation: + conversation_history_for_api.append({ + 'role': aug_msg['role'], + 'content': aug_msg['content'] + }) + final_api_source_refs.append(f"system:augmentation:{len(final_api_source_refs) + 1}") + conversation_history_for_api.extend(history_segments['history_messages']) + final_api_source_refs.extend(history_debug_info.get('history_message_source_refs', [])) # --- Mini SK analysis for tabular files uploaded directly to chat --- if chat_tabular_files and is_tabular_processing_enabled(settings): @@ -7166,7 +9520,7 @@ def publish_live_plugin_thought(thought_payload): f"baseline_invocations={baseline_tabular_invocation_count}" ) - chat_tabular_analysis = asyncio.run(run_tabular_sk_analysis( + chat_tabular_analysis = asyncio.run(run_tabular_analysis_with_multi_file_support( user_question=user_message, tabular_filenames=chat_tabular_files, user_id=user_id, @@ -7202,6 +9556,7 @@ def publish_live_plugin_thought(thought_payload): chat_tabular_analysis, ) }) + final_api_source_refs.append('system:tabular_results') # Collect tool execution citations chat_tabular_sk_citations = collect_tabular_sk_citations(user_id, conversation_id) @@ -7223,18 +9578,66 @@ def publish_live_plugin_thought(thought_payload): # Add system prompt default_system_prompt = settings.get('default_system_prompt', '').strip() + default_system_prompt_inserted = False if default_system_prompt: has_general_system_prompt = any( msg.get('role') == 'system' and not ( + msg.get('content', '').startswith('') or "retrieved document excerpts" in msg.get('content', '') ) for msg in conversation_history_for_api ) if not has_general_system_prompt: - conversation_history_for_api.insert(0, { + insert_idx = 0 + if ( + 
conversation_history_for_api + and conversation_history_for_api[0].get('role') == 'system' + and conversation_history_for_api[0].get('content', '').startswith( + '' + ) + ): + insert_idx = 1 + conversation_history_for_api.insert(insert_idx, { 'role': 'system', 'content': default_system_prompt }) + final_api_source_refs.insert(insert_idx, 'system:default_prompt') + default_system_prompt_inserted = True + + if not original_hybrid_search_enabled: + history_grounding_message = build_history_grounding_system_message() + insert_idx = 0 + if ( + conversation_history_for_api + and conversation_history_for_api[0].get('role') == 'system' + and conversation_history_for_api[0].get('content', '').startswith( + '' + ) + ): + insert_idx = 1 + if default_system_prompt_inserted: + insert_idx += 1 + conversation_history_for_api.insert(insert_idx, history_grounding_message) + final_api_source_refs.insert(insert_idx, 'system:history_grounding') + + history_debug_info = enrich_history_context_debug_info( + history_debug_info, + conversation_history_for_api, + final_api_source_refs, + path_label='streaming', + augmentation_message_count=len(system_messages_for_augmentation), + default_system_prompt_inserted=default_system_prompt_inserted, + ) + emit_history_context_debug(history_debug_info, conversation_id) + yield emit_thought( + 'history_context', + build_history_context_thought_content(history_debug_info), + build_history_context_thought_detail(history_debug_info), + ) + if settings.get('enable_debug_logging', False): + agent_citations_list.append( + build_history_context_debug_citation(history_debug_info, 'streaming') + ) # Check if agents are enabled and should be used selected_agent = None @@ -7324,6 +9727,14 @@ def publish_live_plugin_thought(thought_payload): debug_print(f"--- Streaming from Agent: {agent_name_used} (model: {actual_model_used}) ---") else: debug_print(f"[Streaming] โš ๏ธ No agent selected, falling back to GPT") + + inject_fact_memory_context( + 
conversation_history=conversation_history_for_api, + scope_id=scope_id, + scope_type=scope_type, + conversation_id=conversation_id, + agent_id=getattr(selected_agent, 'id', None), + ) # Stream the response accumulated_content = "" @@ -7703,13 +10114,14 @@ def make_json_serializable(obj): 'augmented': bool(system_messages_for_augmentation), 'hybrid_citations': hybrid_citations_list, 'web_search_citations': web_search_citations_list, - 'hybridsearch_query': search_query if hybrid_search_enabled and search_results else None, + 'hybridsearch_query': search_query if search_results else None, 'agent_citations': prepared_agent_citations, 'model_deployment_name': final_model_used if use_agent_streaming else gpt_model, 'agent_display_name': agent_display_name_used if use_agent_streaming else None, 'agent_name': agent_name_used if use_agent_streaming else None, 'metadata': { 'reasoning_effort': reasoning_effort, + 'history_context': history_debug_info, 'thread_info': { 'thread_id': user_thread_id, 'previous_thread_id': user_previous_thread_id, @@ -7728,9 +10140,9 @@ def make_json_serializable(obj): # Determine workspace type based on active group/public workspace workspace_type = 'personal' - if active_public_workspace_id: + if effective_active_public_workspace_id: workspace_type = 'public' - elif active_group_id: + elif effective_active_group_id: workspace_type = 'group' log_token_usage( @@ -7743,8 +10155,8 @@ def make_json_serializable(obj): completion_tokens=token_usage_data.get('completion_tokens'), conversation_id=conversation_id, message_id=assistant_message_id, - group_id=active_group_id, - public_workspace_id=active_public_workspace_id, + group_id=effective_active_group_id, + public_workspace_id=effective_active_public_workspace_id, additional_context={ 'agent_name': agent_name_used if use_agent_streaming else None, 'augmented': bool(system_messages_for_augmentation), @@ -7764,20 +10176,20 @@ def make_json_serializable(obj): user_message=user_message, 
conversation_id=conversation_id, user_id=user_id, - active_group_id=active_group_id, - active_group_ids=active_group_ids, - document_scope=document_scope, - selected_document_id=selected_document_id, + active_group_id=effective_active_group_id, + active_group_ids=effective_active_group_ids, + document_scope=effective_document_scope, + selected_document_id=effective_selected_document_id, model_deployment=gpt_model, - hybrid_search_enabled=hybrid_search_enabled, + hybrid_search_enabled=hybrid_search_enabled or history_grounded_search_used, image_gen_enabled=False, selected_documents=combined_documents if combined_documents else None, selected_agent=agent_name_used if use_agent_streaming else None, selected_agent_details=selected_agent_metadata if use_agent_streaming else None, search_results=search_results if search_results else None, conversation_item=conversation_item, - active_public_workspace_id=active_public_workspace_id, - active_public_workspace_ids=active_public_workspace_ids + active_public_workspace_id=effective_active_public_workspace_id, + active_public_workspace_ids=effective_active_public_workspace_ids ) except Exception as e: debug_print(f"Error collecting conversation metadata: {e}") @@ -7872,6 +10284,7 @@ def make_json_serializable(obj): 'incomplete': True, 'error': error_msg, 'reasoning_effort': reasoning_effort, + 'history_context': history_debug_info, 'thread_info': { 'thread_id': user_thread_id, 'previous_thread_id': user_previous_thread_id, @@ -8145,6 +10558,889 @@ def remove_masked_content(content, masked_ranges): return result +def _format_history_message_ref(message): + role = str((message or {}).get('role') or 'unknown') + message_id = str((message or {}).get('id') or 'unknown') + return f"{role}:{message_id}" + + +def _capture_history_refs(refs, max_items=12): + ref_list = [str(ref) for ref in refs if ref] + if len(ref_list) <= max_items: + return ref_list + remaining = len(ref_list) - max_items + return ref_list[:max_items] + [f"... 
(+{remaining} more)"] + + +def _format_history_refs_for_detail(refs): + if not refs: + return 'none' + return ', '.join(str(ref) for ref in refs) + + +def _truncate_history_citation_text(text, max_chars=1600): + value = str(text or '').strip() + if not value: + return '' + if len(value) <= max_chars: + return value + return f"{value[:max_chars]}... [truncated {len(value) - max_chars} chars]" + + +def _serialize_history_citation_value(value, max_chars=1200): + if value in (None, '', [], {}): + return '' + + if isinstance(value, str): + serialized = value + else: + try: + serialized = json.dumps(value, default=str, ensure_ascii=False) + except Exception: + serialized = str(value) + + compact_serialized = ' '.join(serialized.split()) + return _truncate_history_citation_text(compact_serialized, max_chars=max_chars) + + +def _build_agent_citation_history_lines(agent_citations, max_citations=4): + def parse_citation_payload(value): + if isinstance(value, str): + stripped_value = value.strip() + if stripped_value[:1] in ('{', '['): + try: + return json.loads(stripped_value) + except Exception: + return value + return value + + def is_tabular_citation(citation): + if not isinstance(citation, dict): + return False + tool_name = str(citation.get('tool_name') or '') + function_name = str(citation.get('function_name') or '') + plugin_name = str(citation.get('plugin_name') or '') + return ( + plugin_name == 'TabularProcessingPlugin' + or 'TabularProcessingPlugin.' 
in tool_name + or function_name in { + 'aggregate_column', + 'count_rows', + 'count_rows_by_related_values', + 'describe_tabular_file', + 'filter_rows', + 'filter_rows_by_related_values', + 'get_distinct_values', + 'group_by_aggregate', + 'group_by_datetime_component', + 'lookup_value', + 'query_tabular_data', + } + ) + + def build_tabular_signature(citation): + arguments = parse_citation_payload(citation.get('function_arguments')) + result = parse_citation_payload(citation.get('function_result')) + if not isinstance(arguments, dict): + arguments = {} + if not isinstance(result, dict): + result = {} + + tool_signature_name = str(citation.get('function_name') or citation.get('tool_name') or '').strip() + if ' [' in tool_signature_name: + tool_signature_name = tool_signature_name.split(' [', 1)[0] + + signature_payload = { + 'tool': tool_signature_name, + 'filename': result.get('filename') or arguments.get('filename'), + 'column': result.get('column') or arguments.get('column'), + 'values': result.get('values'), + 'sample_rows': result.get('sample_rows'), + 'value': result.get('value'), + } + try: + return json.dumps(signature_payload, sort_keys=True, default=str) + except Exception: + return str(signature_payload) + + def summarize_tabular_values(values, max_chars=2200, max_items=60): + if not isinstance(values, list) or not values: + return '' + + compact_values = [] + current_length = 0 + for index, item in enumerate(values[:max_items]): + item_text = _serialize_history_citation_value(item, max_chars=300) + if not item_text: + continue + + separator_length = 2 if compact_values else 0 + if current_length + separator_length + len(item_text) > max_chars: + remaining = len(values) - index + compact_values.append(f"... (+{remaining} more values)") + break + + compact_values.append(item_text) + current_length += separator_length + len(item_text) + + if len(values) > max_items and (not compact_values or not str(compact_values[-1]).startswith('... 
(+')): + compact_values.append(f"... (+{len(values) - max_items} more values)") + + return '; '.join(compact_values) + + def build_tabular_line(citation): + arguments = parse_citation_payload(citation.get('function_arguments')) + result = parse_citation_payload(citation.get('function_result')) + if not isinstance(arguments, dict): + arguments = {} + if not isinstance(result, dict): + result = {} + + tool_name = str(citation.get('tool_name') or citation.get('function_name') or 'TabularProcessingPlugin').strip() + filename = result.get('filename') or arguments.get('filename') or 'unknown file' + selected_sheet = result.get('selected_sheet') or arguments.get('sheet_name') or 'unknown sheet' + column = result.get('column') or arguments.get('column') or 'unknown column' + distinct_count = result.get('distinct_count') + returned_values = result.get('returned_values') + values_summary = summarize_tabular_values(result.get('values')) + + line_parts = [ + tool_name, + f"file={filename}", + f"sheet={selected_sheet}", + f"column={column}", + ] + if distinct_count not in (None, ''): + line_parts.append(f"distinct_count={distinct_count}") + if returned_values not in (None, ''): + line_parts.append(f"returned_values={returned_values}") + if values_summary: + line_parts.append(f"values={values_summary}") + + return f"- {' | '.join(str(part) for part in line_parts if part not in (None, ''))}" + + eligible_citations = [] + seen_tabular_signatures = set() + for citation in agent_citations or []: + if isinstance(citation, dict): + tool_name = str(citation.get('tool_name') or citation.get('function_name') or '').strip() + if tool_name.startswith('[Debug]') or tool_name == 'Conversation History': + continue + if is_tabular_citation(citation): + signature = build_tabular_signature(citation) + if signature in seen_tabular_signatures: + continue + seen_tabular_signatures.add(signature) + eligible_citations.append(citation) + + lines = [] + for citation in 
eligible_citations[:max_citations]: + if not isinstance(citation, dict): + value_summary = _serialize_history_citation_value(citation, max_chars=800) + if value_summary: + lines.append(f"- Tool result: {value_summary}") + continue + + if is_tabular_citation(citation): + lines.append(build_tabular_line(citation)) + continue + + tool_name = str(citation.get('tool_name') or citation.get('function_name') or 'Tool invocation').strip() + argument_summary = _serialize_history_citation_value(citation.get('function_arguments'), max_chars=350) + result_summary = _serialize_history_citation_value(citation.get('function_result'), max_chars=700) + error_summary = '' + if citation.get('success') is False: + error_summary = _serialize_history_citation_value(citation.get('error_message'), max_chars=400) + + line_parts = [tool_name] + if argument_summary: + line_parts.append(f"args={argument_summary}") + if result_summary: + line_parts.append(f"result={result_summary}") + if error_summary: + line_parts.append(f"error={error_summary}") + lines.append(f"- {' | '.join(line_parts)}") + + remaining = len(eligible_citations) - min(len(eligible_citations), max_citations) + if remaining > 0: + lines.append(f"- ... 
(+{remaining} more prior tool results)") + + return lines + + +def _build_document_citation_history_lines(hybrid_citations, max_citations=5): + lines = [] + for citation in (hybrid_citations or [])[:max_citations]: + if not isinstance(citation, dict): + continue + + file_name = str(citation.get('file_name') or 'Document').strip() + line_parts = [file_name] + + page_number = citation.get('page_number') + if page_number not in (None, ''): + line_parts.append(f"page {page_number}") + + chunk_sequence = citation.get('chunk_sequence') + chunk_id = citation.get('chunk_id') + if chunk_sequence not in (None, ''): + line_parts.append(f"chunk {chunk_sequence}") + elif chunk_id not in (None, ''): + line_parts.append(f"chunk {chunk_id}") + + classification = citation.get('classification') + if classification not in (None, ''): + line_parts.append(str(classification)) + + lines.append(f"- {', '.join(line_parts)}") + + remaining = max(0, len(hybrid_citations or []) - min(len(hybrid_citations or []), max_citations)) + if remaining > 0: + lines.append(f"- ... (+{remaining} more cited documents)") + + return lines + + +def _build_web_citation_history_lines(web_search_citations, max_citations=4): + lines = [] + for citation in (web_search_citations or [])[:max_citations]: + if not isinstance(citation, dict): + continue + + title = str(citation.get('title') or citation.get('url') or 'Web source').strip() + url = str(citation.get('url') or '').strip() + if url and url != title: + lines.append(f"- {title} ({url})") + else: + lines.append(f"- {title}") + + remaining = max(0, len(web_search_citations or []) - min(len(web_search_citations or []), max_citations)) + if remaining > 0: + lines.append(f"- ... 
(+{remaining} more web sources)") + + return lines + + +def _parse_json_object_from_text(text): + """Extract a JSON object from a plain text model response.""" + value = str(text or '').strip() + if not value: + return None + + try: + parsed = json.loads(value) + return parsed if isinstance(parsed, dict) else None + except Exception: + pass + + start_index = value.find('{') + end_index = value.rfind('}') + if start_index == -1 or end_index == -1 or end_index <= start_index: + return None + + try: + parsed = json.loads(value[start_index:end_index + 1]) + return parsed if isinstance(parsed, dict) else None + except Exception: + return None + + +def _normalize_prior_grounded_document_refs(conversation_item): + """Return the reusable grounded document set for follow-up turns with search disabled.""" + normalized_refs = [] + seen_refs = set() + + def add_ref(raw_ref): + if not isinstance(raw_ref, dict): + return + + document_id = str(raw_ref.get('document_id') or '').strip() + scope = str(raw_ref.get('scope') or '').strip().lower() + scope_id = str( + raw_ref.get('scope_id') + or raw_ref.get('group_id') + or raw_ref.get('public_workspace_id') + or raw_ref.get('user_id') + or '' + ).strip() + if not document_id or not scope or not scope_id: + return + + ref_key = (scope, scope_id, document_id) + if ref_key in seen_refs: + return + + seen_refs.add(ref_key) + + normalized_ref = { + 'document_id': document_id, + 'scope': scope, + 'scope_id': scope_id, + 'file_name': raw_ref.get('file_name') or raw_ref.get('title'), + 'classification': raw_ref.get('classification'), + } + + if scope == 'group': + normalized_ref['group_id'] = scope_id + elif scope == 'public': + normalized_ref['public_workspace_id'] = scope_id + else: + normalized_ref['user_id'] = scope_id + + normalized_refs.append(normalized_ref) + + for raw_ref in (conversation_item or {}).get('last_grounded_document_refs', []) or []: + add_ref(raw_ref) + + if normalized_refs: + return normalized_refs + + for tag in 
(conversation_item or {}).get('tags', []) or []: + if not isinstance(tag, dict) or tag.get('category') != 'document': + continue + + scope_info = tag.get('scope') or {} + add_ref({ + 'document_id': tag.get('document_id'), + 'scope': scope_info.get('type'), + 'scope_id': scope_info.get('id'), + 'title': tag.get('title'), + 'classification': tag.get('classification'), + }) + + return normalized_refs + + +def build_prior_grounded_document_search_parameters(grounded_refs): + """Translate grounded document refs into bounded search parameters.""" + document_ids = [] + group_ids = [] + public_workspace_ids = [] + scope_types = set() + + for ref in grounded_refs or []: + if not isinstance(ref, dict): + continue + + document_id = str(ref.get('document_id') or '').strip() + if document_id and document_id not in document_ids: + document_ids.append(document_id) + + scope = str(ref.get('scope') or '').strip().lower() + if not scope: + continue + scope_types.add(scope) + + if scope == 'group': + group_id = str(ref.get('group_id') or ref.get('scope_id') or '').strip() + if group_id and group_id not in group_ids: + group_ids.append(group_id) + elif scope == 'public': + public_workspace_id = str(ref.get('public_workspace_id') or ref.get('scope_id') or '').strip() + if public_workspace_id and public_workspace_id not in public_workspace_ids: + public_workspace_ids.append(public_workspace_id) + + if len(scope_types) == 1: + doc_scope = next(iter(scope_types)) + else: + doc_scope = 'all' + + return { + 'document_ids': document_ids, + 'doc_scope': doc_scope, + 'active_group_ids': group_ids, + 'active_group_id': group_ids[0] if group_ids else None, + 'active_public_workspace_ids': public_workspace_ids, + 'active_public_workspace_id': public_workspace_ids[0] if public_workspace_ids else None, + 'scope_types': sorted(scope_types), + } + + +def build_history_only_assessment_messages(history_segments, default_system_prompt=''): + """Construct the prompt context used to decide whether history 
alone is sufficient.""" + assessment_messages = [] + summary_of_older = str((history_segments or {}).get('summary_of_older') or '').strip() + if summary_of_older: + assessment_messages.append({ + 'role': 'system', + 'content': ( + f"\n{summary_of_older}\n" + "" + ) + }) + + normalized_default_system_prompt = str(default_system_prompt or '').strip() + if normalized_default_system_prompt: + assessment_messages.append({ + 'role': 'system', + 'content': normalized_default_system_prompt, + }) + + assessment_messages.extend((history_segments or {}).get('history_messages', [])) + return assessment_messages + + +def assess_history_only_answerability(gpt_client, gpt_model, conversation_history_for_api): + """Return whether the current question can be answered from existing conversation grounding alone.""" + assessment_prompt = ( + "You are evaluating whether the latest user question can be answered using only the " + "existing conversation context already provided. Earlier assistant turns may include " + "supporting citation context from previously grounded document answers.\n\n" + "Respond with JSON only using this schema:\n" + "{\"can_answer_from_history\": true|false, \"search_query\": \"...\", \"reason\": \"...\"}\n\n" + "Set can_answer_from_history to true only if the conversation already contains enough " + "grounded information to answer confidently without retrieving any new document excerpts. " + "If false, produce a concise standalone search_query that resolves pronouns and omitted " + "references from the conversation for use against the previously grounded documents. " + "Keep reason short." 
+ ) + + assessment_messages = [{'role': 'system', 'content': assessment_prompt}] + assessment_messages.extend(conversation_history_for_api or []) + + assessment_response = gpt_client.chat.completions.create( + model=gpt_model, + messages=assessment_messages, + max_tokens=180, + temperature=0, + ) + response_text = str(assessment_response.choices[0].message.content or '').strip() + response_payload = _parse_json_object_from_text(response_text) or {} + + can_answer_from_history = response_payload.get('can_answer_from_history') + if isinstance(can_answer_from_history, str): + can_answer_from_history = can_answer_from_history.strip().lower() == 'true' + else: + can_answer_from_history = bool(can_answer_from_history) + + return { + 'can_answer_from_history': can_answer_from_history, + 'search_query': str(response_payload.get('search_query') or '').strip(), + 'reason': str(response_payload.get('reason') or '').strip(), + 'raw_response': response_text, + } + + +def build_history_grounding_system_message(): + """Instruction used when explicit workspace search is disabled for the current turn.""" + return { + 'role': 'system', + 'content': ( + "Workspace search is disabled for this turn. Answer only from the existing conversation " + "context and any retrieved document excerpts explicitly provided in this turn. If those " + "sources are insufficient, say that you do not have enough grounded information from the " + "prior conversation sources and ask the user to select a workspace or document." 
+ ), + } + + +def build_assistant_history_content_with_citations(message, content): + base_content = str(content or '').strip() + citation_sections = [] + + agent_lines = _build_agent_citation_history_lines(message.get('agent_citations', [])) + if agent_lines: + citation_sections.append("Prior tool results:\n" + "\n".join(agent_lines)) + + document_lines = _build_document_citation_history_lines(message.get('hybrid_citations', [])) + if document_lines: + citation_sections.append("Prior cited documents:\n" + "\n".join(document_lines)) + + web_lines = _build_web_citation_history_lines(message.get('web_search_citations', [])) + if web_lines: + citation_sections.append("Prior cited web sources:\n" + "\n".join(web_lines)) + + if not citation_sections: + return content + + citation_context = ( + "\n" + + "\n\n".join(citation_sections) + + "\n" + ) + citation_context = _truncate_history_citation_text(citation_context, max_chars=5200) + + if not base_content: + return citation_context + + return f"{base_content}\n\n{citation_context}" + + +def build_history_context_thought_content(history_debug_info): + history_debug_info = history_debug_info or {} + stored_total = history_debug_info.get('stored_total_messages', 0) + recent_count = history_debug_info.get('recent_message_count', 0) + final_api_count = history_debug_info.get('final_api_message_count', 0) + older_count = history_debug_info.get('older_message_count', 0) + summary_requested = history_debug_info.get('summary_requested', False) + summary_used = history_debug_info.get('summary_used', False) + + summary_note = 'no older messages' + if older_count > 0: + if summary_used: + summary_note = f"summarized {history_debug_info.get('summarized_message_count', 0)} older" + elif summary_requested: + summary_note = 'older summary unavailable' + else: + summary_note = 'older summary disabled' + + return ( + f"Prepared {final_api_count} model history messages from {stored_total} stored messages " + f"(recent={recent_count}; 
{summary_note})" + ) + + +def build_history_context_thought_detail(history_debug_info): + history_debug_info = history_debug_info or {} + lines = [ + f"path: {history_debug_info.get('path', 'unknown')}", + ( + f"stored_total={history_debug_info.get('stored_total_messages', 0)}, " + f"history_limit={history_debug_info.get('history_limit', 0)}, " + f"older_count={history_debug_info.get('older_message_count', 0)}, " + f"recent_count={history_debug_info.get('recent_message_count', 0)}, " + f"summary_requested={history_debug_info.get('summary_requested', False)}, " + f"summary_used={history_debug_info.get('summary_used', False)}, " + f"augmentation_count={history_debug_info.get('augmentation_message_count', 0)}, " + f"default_system_prompt_inserted={history_debug_info.get('default_system_prompt_inserted', False)}" + ), + f"older_refs: {_format_history_refs_for_detail(history_debug_info.get('older_message_refs', []))}", + f"recent_refs: {_format_history_refs_for_detail(history_debug_info.get('selected_recent_message_refs', []))}", + f"summarized_refs: {_format_history_refs_for_detail(history_debug_info.get('summarized_message_refs', []))}", + f"skipped_inactive_refs: {_format_history_refs_for_detail(history_debug_info.get('skipped_inactive_message_refs', []))}", + f"skipped_masked_refs: {_format_history_refs_for_detail(history_debug_info.get('skipped_masked_message_refs', []))}", + f"masked_range_refs: {_format_history_refs_for_detail(history_debug_info.get('masked_range_message_refs', []))}", + f"history_segment_refs: {_format_history_refs_for_detail(history_debug_info.get('history_message_source_refs', []))}", + f"final_api_roles: {_format_history_refs_for_detail(history_debug_info.get('final_api_message_roles', []))}", + f"final_api_refs: {_format_history_refs_for_detail(history_debug_info.get('final_api_source_refs', []))}", + ] + return "\n".join(lines) + + +def build_history_context_debug_citation(history_debug_info, path_label): + history_debug_info = 
dict(history_debug_info or {}) + history_debug_info['path'] = path_label + return { + 'tool_name': 'Conversation History', + 'function_arguments': json.dumps({ + 'path': path_label, + 'stored_total_messages': history_debug_info.get('stored_total_messages', 0), + 'history_limit': history_debug_info.get('history_limit', 0), + 'older_message_count': history_debug_info.get('older_message_count', 0), + 'recent_message_count': history_debug_info.get('recent_message_count', 0), + 'final_api_message_count': history_debug_info.get('final_api_message_count', 0), + 'summary_requested': history_debug_info.get('summary_requested', False), + 'summary_used': history_debug_info.get('summary_used', False), + }), + 'function_result': build_history_context_thought_detail(history_debug_info), + 'timestamp': datetime.utcnow().isoformat(), + } + + +def enrich_history_context_debug_info( + history_debug_info, + conversation_history_for_api, + final_api_source_refs, + path_label, + augmentation_message_count=0, + default_system_prompt_inserted=False, +): + enriched = dict(history_debug_info or {}) + enriched['path'] = path_label + enriched['augmentation_message_count'] = augmentation_message_count + enriched['default_system_prompt_inserted'] = bool(default_system_prompt_inserted) + enriched['final_api_message_count'] = len(conversation_history_for_api or []) + enriched['final_api_message_roles'] = [ + str((message or {}).get('role') or 'unknown') + for message in (conversation_history_for_api or []) + ] + enriched['final_api_source_refs'] = _capture_history_refs(final_api_source_refs, max_items=20) + return enriched + + +def emit_history_context_debug(history_debug_info, conversation_id): + debug_payload = history_debug_info or {} + debug_print( + f"[History Context][{debug_payload.get('path', 'unknown')}] conversation_id={conversation_id} | " + f"{json.dumps(debug_payload, default=str)}" + ) + + +def build_conversation_history_segments( + all_messages, + conversation_history_limit, + 
enable_summarize_older_messages=False, + gpt_client=None, + gpt_model=None, + user_message_id=None, + fallback_user_message="", +): + """Build shared conversation history segments for chat completions.""" + conversation_history_messages = [] + summary_of_older = "" + chat_tabular_files = set() + + artifact_payload_map = build_message_artifact_payload_map(all_messages or []) + filtered_messages = filter_assistant_artifact_items(all_messages or []) + filtered_messages = hydrate_agent_citations_from_artifacts(filtered_messages, artifact_payload_map) + ordered_messages = sort_messages_by_thread(filtered_messages) + + total_messages = len(ordered_messages) + num_recent_messages = min(total_messages, conversation_history_limit) + num_older_messages = total_messages - num_recent_messages + + recent_messages = ordered_messages[-num_recent_messages:] if num_recent_messages else [] + older_messages_to_summarize = ordered_messages[:num_older_messages] + + summarized_message_refs = [] + skipped_inactive_message_refs = [] + skipped_masked_message_refs = [] + masked_range_message_refs = [] + history_message_source_refs = [] + appended_fallback_user_message = False + + if enable_summarize_older_messages and older_messages_to_summarize and gpt_client and gpt_model: + debug_print( + f"Summarizing {len(older_messages_to_summarize)} older messages for current conversation history" + ) + summary_prompt_older = ( + "Summarize the following conversation history concisely (around 50-100 words), " + "focusing on key facts, decisions, or context that might be relevant for future turns. 
" + "Do not add any introductory phrases like 'Here is a summary'.\n\n" + "Conversation History:\n" + ) + message_texts_older = [] + for message in older_messages_to_summarize: + role = message.get('role', 'user') + metadata = message.get('metadata', {}) + thread_info = metadata.get('thread_info', {}) + active_thread = thread_info.get('active_thread') + + if active_thread is False: + debug_print(f"[THREAD] Skipping inactive thread message {message.get('id')} from summary") + skipped_inactive_message_refs.append(_format_history_message_ref(message)) + continue + + if role in ['system', 'safety', 'blocked', 'image', 'file']: + continue + + content = message.get('content', '') + if role == 'assistant': + content = build_assistant_history_content_with_citations(message, content) + message_texts_older.append(f"{role.upper()}: {content}") + summarized_message_refs.append(_format_history_message_ref(message)) + + if message_texts_older: + summary_prompt_older += "\n".join(message_texts_older) + try: + summary_response_older = gpt_client.chat.completions.create( + model=gpt_model, + messages=[{"role": "system", "content": summary_prompt_older}], + max_tokens=150, + temperature=0.3, + ) + summary_of_older = summary_response_older.choices[0].message.content.strip() + debug_print(f"Generated summary: {summary_of_older}") + except Exception as exc: + debug_print(f"Error summarizing older conversation history: {exc}") + summary_of_older = "" + else: + debug_print("No summarizable content found in older messages.") + + allowed_roles_in_history = ['user', 'assistant'] + max_file_content_length_in_history = 50000 + max_tabular_content_length_in_history = 50000 + + for message in recent_messages: + role = message.get('role') + content = message.get('content') + metadata = message.get('metadata', {}) + + thread_info = metadata.get('thread_info', {}) + active_thread = thread_info.get('active_thread') + if active_thread is False: + debug_print( + f"[THREAD] Skipping inactive thread 
message {message.get('id')} " + f"(thread_id: {thread_info.get('thread_id')}, attempt: {thread_info.get('thread_attempt')})" + ) + skipped_inactive_message_refs.append(_format_history_message_ref(message)) + continue + + if metadata.get('masked', False): + debug_print(f"[MASK] Skipping fully masked message {message.get('id')}") + skipped_masked_message_refs.append(_format_history_message_ref(message)) + continue + + masked_ranges = metadata.get('masked_ranges', []) + if masked_ranges and content: + content = remove_masked_content(content, masked_ranges) + masked_range_message_refs.append(_format_history_message_ref(message)) + debug_print(f"[MASK] Applied {len(masked_ranges)} masked ranges to message {message.get('id')}") + + if role in allowed_roles_in_history: + if role == 'assistant': + content = build_assistant_history_content_with_citations(message, content) + conversation_history_messages.append({"role": role, "content": content}) + history_message_source_refs.append(_format_history_message_ref(message)) + elif role == 'file': + filename = message.get('filename', 'uploaded_file') + file_content = message.get('file_content', '') + is_table = message.get('is_table', False) + file_content_source = message.get('file_content_source', '') + + if is_table and file_content_source == 'blob': + chat_tabular_files.add(filename) + conversation_history_messages.append({ + 'role': 'system', + 'content': ( + f"[User uploaded a tabular data file named '{filename}'. " + f"The file is stored in blob storage and available for analysis. " + f"Use the tabular_processing plugin functions (list_tabular_files, describe_tabular_file, " + f"aggregate_column, filter_rows, query_tabular_data, group_by_aggregate, " + f"group_by_datetime_component) to analyze this data. 
" + f"The file source is 'chat'.]" + ) + }) + else: + content_limit = ( + max_tabular_content_length_in_history + if is_table else max_file_content_length_in_history + ) + display_content = file_content[:content_limit] + if len(file_content) > content_limit: + display_content += "..." + + if is_table: + conversation_history_messages.append({ + 'role': 'system', + 'content': ( + f"[User uploaded a tabular data file named '{filename}'. This is CSV format data for analysis:\n" + f"{display_content}]\n" + "This is complete tabular data in CSV format. You can perform calculations, analysis, and " + "data operations on this dataset." + ) + }) + else: + conversation_history_messages.append({ + 'role': 'system', + 'content': ( + f"[User uploaded a file named '{filename}'. Content preview:\n{display_content}]\n" + "Use this file context if relevant." + ) + }) + history_message_source_refs.append(f"system:file:{message.get('id', 'unknown')}") + elif role == 'image': + filename = message.get('filename', 'uploaded_image') + is_user_upload = metadata.get('is_user_upload', False) + + if is_user_upload: + extracted_text = message.get('extracted_text', '') + vision_analysis = message.get('vision_analysis', {}) + image_context_parts = [f"[User uploaded an image named '{filename}'.]"] + + if extracted_text: + extracted_preview = extracted_text[:max_file_content_length_in_history] + if len(extracted_text) > max_file_content_length_in_history: + extracted_preview += "..." 
+ image_context_parts.append(f"\n\nExtracted Text (OCR):\n{extracted_preview}") + + if vision_analysis: + image_context_parts.append("\n\nAI Vision Analysis:") + if vision_analysis.get('description'): + image_context_parts.append(f"\nDescription: {vision_analysis['description']}") + if vision_analysis.get('objects'): + objects_str = ', '.join(vision_analysis['objects']) + image_context_parts.append(f"\nObjects detected: {objects_str}") + if vision_analysis.get('text'): + image_context_parts.append(f"\nText visible in image: {vision_analysis['text']}") + if vision_analysis.get('contextual_analysis'): + image_context_parts.append( + f"\nContextual analysis: {vision_analysis['contextual_analysis']}" + ) + + image_context_content = ''.join(image_context_parts) + image_context_content += "\n\nUse this image information to answer questions about the uploaded image." + + if 'data:image/' in image_context_content or ';base64,' in image_context_content: + debug_print( + f"WARNING: Base64 image data detected in chat history for {filename}! Removing to save tokens." 
+ ) + image_context_content = ( + f"[User uploaded an image named '{filename}' - image data excluded from chat history to conserve tokens]" + ) + + debug_print( + f"[IMAGE_CONTEXT] Adding user-uploaded image to history: {filename}, " + f"context length: {len(image_context_content)} chars" + ) + conversation_history_messages.append({ + 'role': 'system', + 'content': image_context_content, + }) + else: + prompt = message.get('prompt', 'User requested image generation.') + debug_print(f"[IMAGE_CONTEXT] Adding system-generated image to history: {prompt[:100]}...") + conversation_history_messages.append({ + 'role': 'system', + 'content': f"[Assistant generated an image based on the prompt: '{prompt}']", + }) + + history_message_source_refs.append(f"system:image:{message.get('id', 'unknown')}") + + if not conversation_history_messages or conversation_history_messages[-1].get('role') != 'user': + debug_print("Warning: Last message in history is not the user's current message. Appending.") + user_msg_found = False + for message in reversed(recent_messages): + if message.get('role') != 'user': + continue + if user_message_id and message.get('id') != user_message_id: + continue + conversation_history_messages.append({ + 'role': 'user', + 'content': message.get('content', ''), + }) + history_message_source_refs.append(_format_history_message_ref(message)) + user_msg_found = True + break + + if not user_msg_found and fallback_user_message: + conversation_history_messages.append({ + 'role': 'user', + 'content': fallback_user_message, + }) + history_message_source_refs.append('user:fallback_input') + appended_fallback_user_message = True + + debug_info = { + 'history_limit': conversation_history_limit, + 'summary_requested': bool(enable_summarize_older_messages), + 'summary_used': bool(summary_of_older), + 'stored_total_messages': total_messages, + 'older_message_count': len(older_messages_to_summarize), + 'recent_message_count': len(recent_messages), + 
'summarized_message_count': len(summarized_message_refs), + 'older_message_refs': _capture_history_refs( + [_format_history_message_ref(message) for message in older_messages_to_summarize] + ), + 'selected_recent_message_refs': _capture_history_refs( + [_format_history_message_ref(message) for message in recent_messages] + ), + 'summarized_message_refs': _capture_history_refs(summarized_message_refs), + 'skipped_inactive_message_refs': _capture_history_refs(skipped_inactive_message_refs), + 'skipped_masked_message_refs': _capture_history_refs(skipped_masked_message_refs), + 'masked_range_message_refs': _capture_history_refs(masked_range_message_refs), + 'history_message_source_refs': _capture_history_refs(history_message_source_refs, max_items=20), + 'appended_fallback_user_message': appended_fallback_user_message, + } + + return { + 'summary_of_older': summary_of_older, + 'history_messages': conversation_history_messages, + 'chat_tabular_files': chat_tabular_files, + 'debug_info': debug_info, + } + + def _extract_web_search_citations_from_content(content: str) -> List[Dict[str, str]]: if not content: return [] diff --git a/application/single_app/route_backend_documents.py b/application/single_app/route_backend_documents.py index bd4d7e43..b70d9980 100644 --- a/application/single_app/route_backend_documents.py +++ b/application/single_app/route_backend_documents.py @@ -4,6 +4,8 @@ from functions_authentication import * from functions_documents import * from functions_settings import * +from functions_group import get_user_groups +from functions_public_workspaces import get_user_visible_public_workspace_ids_from_settings from utils_cache import invalidate_personal_search_cache from functions_debug import * from functions_activity_logging import log_document_upload, log_document_metadata_update_transaction @@ -14,6 +16,102 @@ from swagger_wrapper import swagger_route, get_auth_security from functions_debug import debug_print + +def 
_extract_citation_document_id(chunk, citation_id): + document_id = (chunk or {}).get('document_id') if isinstance(chunk, dict) else None + if document_id: + return str(document_id) + + if citation_id and '_' in citation_id: + return citation_id.rsplit('_', 1)[0] + + return citation_id + + +def _try_get_document_json(user_id, document_id, group_id=None, public_workspace_id=None): + try: + doc_response, status_code = get_document( + user_id, + document_id, + group_id=group_id, + public_workspace_id=public_workspace_id, + ) + except Exception: + return None + + if status_code != 200: + return None + + if isinstance(doc_response, dict): + return doc_response + + get_json = getattr(doc_response, 'get_json', None) + if callable(get_json): + return get_json() + + return None + + +def _find_accessible_citation_document(user_id, document_id, scope_name): + if not user_id or not document_id: + return None + + settings = get_settings() + + if scope_name == 'personal': + if not settings.get('enable_user_workspace', False): + return None + return _try_get_document_json(user_id, document_id) + + if scope_name == 'group': + if not settings.get('enable_group_workspaces', False): + return None + + try: + user_groups = get_user_groups(user_id) + except Exception: + return None + + for group in user_groups: + group_id = group.get('id') + if not group_id: + continue + + document_json = _try_get_document_json( + user_id, + document_id, + group_id=group_id, + ) + if document_json: + return document_json + + return None + + if scope_name == 'public': + if not settings.get('enable_public_workspaces', False): + return None + + try: + workspace_ids = get_user_visible_public_workspace_ids_from_settings(user_id) + except Exception: + return None + + for workspace_id in workspace_ids: + if not workspace_id: + continue + + document_json = _try_get_document_json( + user_id, + document_id, + public_workspace_id=workspace_id, + ) + if document_json: + return document_json + + return None + + 
return None + def register_route_backend_documents(app): @app.route('/api/get_file_content', methods=['POST']) @swagger_route(security=get_auth_security()) @@ -428,48 +526,34 @@ def api_get_user_documents(): # Combine conditions into the WHERE clause where_clause = " AND ".join(query_conditions) - # --- 3) First query: get total count based on filters --- - try: - count_query_str = f"SELECT VALUE COUNT(1) FROM c WHERE {where_clause}" - # debug_print(f"Count Query: {count_query_str}") # Optional Debugging - # debug_print(f"Count Params: {query_params}") # Optional Debugging - count_items = list(cosmos_user_documents_container.query_items( - query=count_query_str, - parameters=query_params, - enable_cross_partition_query=True # May be needed if user_id is not partition key - )) - total_count = count_items[0] if count_items else 0 - - except Exception as e: - debug_print(f"Error executing count query: {e}") # Log the error - return jsonify({"error": f"Error counting documents: {str(e)}"}), 500 - - - # --- 4) Second query: fetch the page of data based on filters --- + # --- 3) Query matching documents, then collapse to current revisions before paginating --- try: offset = (page - 1) * page_size data_query_str = f""" SELECT * FROM c WHERE {where_clause} - ORDER BY c.{sort_by} {sort_order} - OFFSET {offset} LIMIT {page_size} """ - # debug_print(f"Data Query: {data_query_str}") # Optional Debugging - # debug_print(f"Data Params: {query_params}") # Optional Debugging - docs = list(cosmos_user_documents_container.query_items( + matching_docs = list(cosmos_user_documents_container.query_items( query=data_query_str, parameters=query_params, - enable_cross_partition_query=True # May be needed if user_id is not partition key + enable_cross_partition_query=True )) + current_docs = sort_documents( + select_current_documents(matching_docs), + sort_by=sort_by, + sort_order=sort_order, + ) + total_count = len(current_docs) + docs = current_docs[offset:offset + page_size] + # Add 
shared_approval_status and owner_id for each doc for doc in docs: doc["owner_id"] = doc.get("user_id") # Always set owner_id to the original user_id if doc.get("user_id") == user_id: doc["shared_approval_status"] = "owner" else: - # Find entry for this user in shared_user_ids status = None for entry in doc.get("shared_user_ids", []): if entry.startswith(f"{user_id},"): @@ -477,7 +561,7 @@ def api_get_user_documents(): break doc["shared_approval_status"] = status or "none" except Exception as e: - debug_print(f"Error executing data query: {e}") # Log the error + debug_print(f"Error executing data query: {e}") return jsonify({"error": f"Error fetching documents: {str(e)}"}), 500 @@ -673,15 +757,21 @@ def api_delete_user_document(document_id): user_id = get_current_user_id() if not user_id: return jsonify({'error': 'User not authenticated'}), 401 + + delete_mode = request.args.get('delete_mode', 'all_versions') + if delete_mode not in {'all_versions', 'current_only'}: + return jsonify({'error': 'Invalid delete mode'}), 400 try: - delete_document(user_id, document_id) - delete_document_chunks(document_id) + delete_result = delete_document_revision(user_id, document_id, delete_mode=delete_mode) # Invalidate search cache since document was deleted invalidate_personal_search_cache(user_id) - return jsonify({'message': 'Document deleted successfully'}), 200 + return jsonify({ + 'message': 'Document deleted successfully', + **delete_result, + }), 200 except Exception as e: return jsonify({'error': f'Error deleting document: {str(e)}'}), 500 @@ -733,53 +823,40 @@ def get_citation(): if not citation_id: return jsonify({"error": "Missing citation_id"}), 400 - try: - search_client_user = CLIENTS['search_client_user'] - chunk = search_client_user.get_document(key=citation_id) - - # Check if user owns the document or if document is shared with user - chunk_user_id = chunk.get("user_id") - chunk_shared_user_ids = chunk.get("shared_user_ids", []) - - # Allow access if user is owner 
or in shared_user_ids (prefix match) - is_shared = any( - entry == user_id or entry.startswith(f"{user_id},") - for entry in chunk_shared_user_ids - ) - if chunk_user_id != user_id and not is_shared: - return jsonify({"error": "Unauthorized access to citation"}), 403 - + def build_citation_response(chunk): return jsonify({ "cited_text": chunk.get("chunk_text", ""), "file_name": chunk.get("file_name", ""), "page_number": chunk.get("chunk_sequence", 0) }), 200 + def get_citation_for_scope(search_client, scope_name): + chunk = search_client.get_document(key=citation_id) + document_id = _extract_citation_document_id(chunk, citation_id) + accessible_document = _find_accessible_citation_document(user_id, document_id, scope_name) + + if not accessible_document: + return jsonify({"error": "Unauthorized access to citation"}), 403 + + return build_citation_response(chunk) + + try: + search_client_user = CLIENTS['search_client_user'] + return get_citation_for_scope(search_client_user, 'personal') + except ResourceNotFoundError: pass try: search_client_group = CLIENTS['search_client_group'] - group_chunk = search_client_group.get_document(key=citation_id) - - return jsonify({ - "cited_text": group_chunk.get("chunk_text", ""), - "file_name": group_chunk.get("file_name", ""), - "page_number": group_chunk.get("chunk_sequence", 0) - }), 200 + return get_citation_for_scope(search_client_group, 'group') except ResourceNotFoundError: pass try: search_client_public = CLIENTS['search_client_public'] - public_chunk = search_client_public.get_document(key=citation_id) - - return jsonify({ - "cited_text": public_chunk.get("chunk_text", ""), - "file_name": public_chunk.get("file_name", ""), - "page_number": public_chunk.get("chunk_sequence", 0) - }), 200 + return get_citation_for_scope(search_client_public, 'public') except ResourceNotFoundError: return jsonify({"error": "Citation not found in user, group, or public docs"}), 404 diff --git 
a/application/single_app/route_backend_group_documents.py b/application/single_app/route_backend_group_documents.py index 957c3ee4..d8f00a04 100644 --- a/application/single_app/route_backend_group_documents.py +++ b/application/single_app/route_backend_group_documents.py @@ -277,34 +277,26 @@ def api_get_group_documents(): where_clause = " AND ".join(query_conditions) - # --- 3) Get total count --- - try: - count_query_str = f"SELECT VALUE COUNT(1) FROM c WHERE {where_clause}" - count_items = list(cosmos_group_documents_container.query_items( - query=count_query_str, - parameters=query_params, - enable_cross_partition_query=True - )) - total_count = count_items[0] if count_items else 0 - except Exception as e: - print(f"Error executing count query for group: {e}") - return jsonify({"error": f"Error counting documents: {str(e)}"}), 500 - - # --- 4) Get paginated data --- + # --- 3) Query matching documents, then collapse to current revisions before paginating --- try: offset = (page - 1) * page_size data_query_str = f""" SELECT * FROM c WHERE {where_clause} - ORDER BY c.{sort_by} {sort_order} - OFFSET {offset} LIMIT {page_size} """ - docs = list(cosmos_group_documents_container.query_items( + matching_docs = list(cosmos_group_documents_container.query_items( query=data_query_str, parameters=query_params, enable_cross_partition_query=True )) + current_docs = sort_documents( + select_current_documents(matching_docs), + sort_by=sort_by, + sort_order=sort_order, + ) + total_count = len(current_docs) + docs = current_docs[offset:offset + page_size] except Exception as e: print(f"Error fetching group documents: {e}") return jsonify({"error": f"Error fetching documents: {str(e)}"}), 500 @@ -570,14 +562,25 @@ def api_delete_group_document(document_id): if role not in ["Owner", "Admin", "DocumentManager"]: return jsonify({'error': 'You do not have permission to delete documents in this group'}), 403 + delete_mode = request.args.get('delete_mode', 'all_versions') + if 
delete_mode not in {'all_versions', 'current_only'}: + return jsonify({'error': 'Invalid delete mode'}), 400 + try: - delete_document(user_id=user_id, document_id=document_id, group_id=active_group_id) - delete_document_chunks(document_id=document_id, group_id=active_group_id) + delete_result = delete_document_revision( + user_id=user_id, + document_id=document_id, + delete_mode=delete_mode, + group_id=active_group_id, + ) # Invalidate group search cache since document was deleted invalidate_group_search_cache(active_group_id) - return jsonify({'message': 'Group document deleted successfully'}), 200 + return jsonify({ + 'message': 'Group document deleted successfully', + **delete_result, + }), 200 except Exception as e: return jsonify({'error': f'Error deleting group document: {str(e)}'}), 500 diff --git a/application/single_app/route_backend_public_documents.py b/application/single_app/route_backend_public_documents.py index dab2bdb8..3b7486bd 100644 --- a/application/single_app/route_backend_public_documents.py +++ b/application/single_app/route_backend_public_documents.py @@ -207,18 +207,17 @@ def api_list_public_documents(): where = ' AND '.join(conds) - # count - count_q = f'SELECT VALUE COUNT(1) FROM c WHERE {where}' - total = list(cosmos_public_documents_container.query_items( - query=count_q, parameters=params, enable_cross_partition_query=True - )) - total_count = total[0] if total else 0 - - # data - data_q = f'SELECT * FROM c WHERE {where} ORDER BY c.{sort_by} {sort_order} OFFSET {offset} LIMIT {page_size}' - docs = list(cosmos_public_documents_container.query_items( + data_q = f'SELECT * FROM c WHERE {where}' + matching_docs = list(cosmos_public_documents_container.query_items( query=data_q, parameters=params, enable_cross_partition_query=True )) + current_docs = sort_documents( + select_current_documents(matching_docs), + sort_by=sort_by, + sort_order=sort_order, + ) + total_count = len(current_docs) + docs = current_docs[offset:offset + page_size] # 
legacy legacy_q = 'SELECT VALUE COUNT(1) FROM c WHERE c.public_workspace_id = @ws AND NOT IS_DEFINED(c.percentage_complete)' @@ -284,8 +283,7 @@ def api_list_public_workspace_documents(): enable_cross_partition_query=True )) - # Limit results to page_size - docs = docs[:page_size] + docs = sort_documents(select_current_documents(docs))[:page_size] return jsonify({ 'documents': docs, @@ -404,14 +402,21 @@ def api_delete_public_document(doc_id): role = get_user_role_in_public_workspace(ws_doc, user_id) if ws_doc else None if role not in ['Owner','Admin','DocumentManager']: return jsonify({'error':'Access denied'}), 403 + delete_mode = request.args.get('delete_mode', 'all_versions') + if delete_mode not in {'all_versions', 'current_only'}: + return jsonify({'error': 'Invalid delete mode'}), 400 try: - delete_document(user_id=user_id, document_id=doc_id, public_workspace_id=active_ws) - delete_document_chunks(document_id=doc_id, public_workspace_id=active_ws) + delete_result = delete_document_revision( + user_id=user_id, + document_id=doc_id, + delete_mode=delete_mode, + public_workspace_id=active_ws, + ) # Invalidate public workspace search cache since document was deleted invalidate_public_workspace_search_cache(active_ws) - return jsonify({'message':'Deleted'}), 200 + return jsonify({'message':'Deleted', **delete_result}), 200 except Exception as e: return jsonify({'error':str(e)}), 500 diff --git a/application/single_app/route_enhanced_citations.py b/application/single_app/route_enhanced_citations.py index 60675f41..29de8313 100644 --- a/application/single_app/route_enhanced_citations.py +++ b/application/single_app/route_enhanced_citations.py @@ -12,15 +12,98 @@ from functions_authentication import login_required, user_required, get_current_user_id from functions_settings import get_settings, enabled_required -from functions_documents import get_document_metadata +from functions_documents import get_document_metadata, get_document_blob_storage_info from 
functions_group import get_user_groups from functions_public_workspaces import get_user_visible_public_workspace_ids_from_settings from swagger_wrapper import swagger_route, get_auth_security from config import CLIENTS, storage_account_user_documents_container_name, storage_account_group_documents_container_name, storage_account_public_documents_container_name, storage_account_personal_chat_container_name, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS, TABULAR_EXTENSIONS, cosmos_messages_container, cosmos_conversations_container from functions_debug import debug_print + +def _sanitize_tabular_preview_value(value): + """Convert pandas preview values into JSON-safe display strings.""" + if hasattr(value, 'item') and not isinstance(value, (str, bytes)): + try: + value = value.item() + except (TypeError, ValueError): + pass + + if value is None: + return '' + + if pandas.api.types.is_scalar(value): + try: + if pandas.isna(value): + return '' + except (TypeError, ValueError): + pass + + if isinstance(value, bytes): + return value.decode('utf-8', errors='replace') + + if hasattr(value, 'isoformat') and not isinstance(value, str): + try: + return value.isoformat() + except TypeError: + pass + + return str(value) + + +def _serialize_tabular_preview_table(df_preview): + """Build JSON-safe tabular preview payload pieces for the browser.""" + columns = [ + _sanitize_tabular_preview_value(column) + for column in df_preview.columns.tolist() + ] + rows = [ + [_sanitize_tabular_preview_value(cell) for cell in row] + for row in df_preview.itertuples(index=False, name=None) + ] + return columns, rows + def register_enhanced_citations_routes(app): """Register enhanced citations routes""" + + @app.route("/api/enhanced_citations/document_metadata", methods=["GET"]) + @swagger_route(security=get_auth_security()) + @login_required + @user_required + @enabled_required("enable_enhanced_citations") + def get_enhanced_citation_document_metadata(): + """ + Return minimal document 
metadata for an exact historical or current doc_id. + This lets the chat UI render enhanced citations even when the cited + document revision is not part of the currently loaded workspace list. + """ + doc_id = request.args.get("doc_id") + if not doc_id: + return jsonify({"error": "doc_id is required"}), 400 + + user_id = get_current_user_id() + if not user_id: + return jsonify({"error": "User not authenticated"}), 401 + + try: + doc_response, status_code = get_document(user_id, doc_id) + if status_code != 200: + return doc_response, status_code + + raw_doc = doc_response.get_json() + _, blob_path = get_document_blob_storage_info(raw_doc) + + return jsonify({ + "id": raw_doc.get("id"), + "document_id": raw_doc.get("id"), + "file_name": raw_doc.get("file_name"), + "version": raw_doc.get("version"), + "is_current_version": raw_doc.get("is_current_version"), + "enhanced_citations": bool(blob_path), + }), 200 + + except Exception as e: + debug_print(f"Error getting enhanced citation document metadata: {e}") + return jsonify({"error": str(e)}), 500 @app.route("/api/enhanced_citations/image", methods=["GET"]) @swagger_route(security=get_auth_security()) @@ -435,6 +518,7 @@ def get_enhanced_citation_tabular_preview(): total_rows = len(df) truncated = total_rows > max_rows preview = df.head(max_rows) + columns, rows = _serialize_tabular_preview_table(preview) return jsonify({ "filename": file_name, @@ -443,8 +527,8 @@ def get_enhanced_citation_tabular_preview(): "sheet_count": len(sheet_names), "total_rows": total_rows if not truncated else None, "total_columns": len(df.columns), - "columns": list(df.columns), - "rows": preview.values.tolist(), + "columns": columns, + "rows": rows, "truncated": truncated }) @@ -513,16 +597,20 @@ def determine_workspace_type_and_container(raw_doc): Determine workspace type and appropriate container based on document metadata """ if raw_doc.get('public_workspace_id'): - return 'public', storage_account_public_documents_container_name + 
return 'public', raw_doc.get('blob_container') or storage_account_public_documents_container_name elif raw_doc.get('group_id'): - return 'group', storage_account_group_documents_container_name + return 'group', raw_doc.get('blob_container') or storage_account_group_documents_container_name else: - return 'personal', storage_account_user_documents_container_name + return 'personal', raw_doc.get('blob_container') or storage_account_user_documents_container_name def get_blob_name(raw_doc, workspace_type): """ Determine the correct blob name based on workspace type """ + _, blob_name = get_document_blob_storage_info(raw_doc) + if blob_name: + return blob_name + if workspace_type == 'public': return f"{raw_doc['public_workspace_id']}/{raw_doc['file_name']}" elif workspace_type == 'group': diff --git a/application/single_app/route_external_public_documents.py b/application/single_app/route_external_public_documents.py index 67bcbafa..88b496cc 100644 --- a/application/single_app/route_external_public_documents.py +++ b/application/single_app/route_external_public_documents.py @@ -181,34 +181,22 @@ def external_get_public_documents(): where_clause = " AND ".join(query_conditions) - # --- 3) Get total count --- - try: - count_query_str = f"SELECT VALUE COUNT(1) FROM c WHERE {where_clause}" - count_items = list(cosmos_public_documents_container.query_items( - query=count_query_str, - parameters=query_params, - enable_cross_partition_query=True - )) - total_count = count_items[0] if count_items else 0 - except Exception as e: - print(f"Error executing count query for public: {e}") - return jsonify({"error": f"Error counting documents: {str(e)}"}), 500 - - # --- 4) Get paginated data --- + # --- 3) Query matching documents, then collapse to current revisions before paginating --- try: offset = (page - 1) * page_size data_query_str = f""" SELECT * FROM c WHERE {where_clause} - ORDER BY c._ts DESC - OFFSET {offset} LIMIT {page_size} """ - docs = 
list(cosmos_public_documents_container.query_items( + matching_docs = list(cosmos_public_documents_container.query_items( query=data_query_str, parameters=query_params, enable_cross_partition_query=True )) + current_docs = sort_documents(select_current_documents(matching_docs)) + total_count = len(current_docs) + docs = current_docs[offset:offset + page_size] except Exception as e: print(f"Error fetching public documents: {e}") return jsonify({"error": f"Error fetching documents: {str(e)}"}), 500 @@ -398,11 +386,22 @@ def external_delete_public_document(document_id): """ user_id = request.args.get('user_id') active_workspace_id = request.args.get('active_workspace_id') + delete_mode = request.args.get('delete_mode', 'all_versions') + + if delete_mode not in {'all_versions', 'current_only'}: + return jsonify({'error': 'Invalid delete mode'}), 400 try: - delete_document(user_id=user_id, document_id=document_id, public_workspace_id=active_workspace_id) - delete_document_chunks(document_id=document_id, public_workspace_id=active_workspace_id) - return jsonify({'message': 'Public document deleted successfully'}), 200 + delete_result = delete_document_revision( + user_id=user_id, + document_id=document_id, + delete_mode=delete_mode, + public_workspace_id=active_workspace_id, + ) + return jsonify({ + 'message': 'Public document deleted successfully', + **delete_result, + }), 200 except Exception as e: return jsonify({'error': f'Error deleting public document: {str(e)}'}), 500 diff --git a/application/single_app/route_frontend_conversations.py b/application/single_app/route_frontend_conversations.py index 4ffc2371..d2b428fe 100644 --- a/application/single_app/route_frontend_conversations.py +++ b/application/single_app/route_frontend_conversations.py @@ -4,7 +4,10 @@ from functions_authentication import * from functions_debug import debug_print from functions_chat import sort_messages_by_thread -from functions_message_artifacts import filter_assistant_artifact_items +from 
functions_message_artifacts import ( + build_message_artifact_payload_map, + filter_assistant_artifact_items, +) from swagger_wrapper import swagger_route, get_auth_security def register_route_frontend_conversations(app): @@ -193,6 +196,42 @@ def get_conversation_messages(conversation_id): return jsonify({'messages': messages}) + @app.route('/api/conversation//agent-citation/', methods=['GET']) + @swagger_route(security=get_auth_security()) + @login_required + @user_required + def get_agent_citation_artifact(conversation_id, artifact_id): + user_id = get_current_user_id() + if not user_id: + return jsonify({'error': 'User not authenticated'}), 401 + + try: + conversation = cosmos_conversations_container.read_item( + item=conversation_id, + partition_key=conversation_id, + ) + except CosmosResourceNotFoundError: + return jsonify({'error': 'Conversation not found'}), 404 + + if conversation.get('user_id') != user_id: + return jsonify({'error': 'Unauthorized access to conversation'}), 403 + + conversation_messages = list(cosmos_messages_container.query_items( + query="SELECT * FROM c WHERE c.conversation_id = @conversation_id", + parameters=[{'name': '@conversation_id', 'value': conversation_id}], + partition_key=conversation_id, + )) + artifact_payload_map = build_message_artifact_payload_map(conversation_messages) + artifact_payload = artifact_payload_map.get(str(artifact_id or '')) + if not isinstance(artifact_payload, dict): + return jsonify({'error': 'Agent citation artifact not found'}), 404 + + citation = artifact_payload.get('citation') + if citation is None: + return jsonify({'error': 'Agent citation payload not found'}), 404 + + return jsonify({'citation': citation}) + @app.route('/api/message//metadata', methods=['GET']) @swagger_route(security=get_auth_security()) @login_required diff --git a/application/single_app/semantic_kernel_plugins/plugin_invocation_logger.py b/application/single_app/semantic_kernel_plugins/plugin_invocation_logger.py index 
7dd66517..dea35f22 100644 --- a/application/single_app/semantic_kernel_plugins/plugin_invocation_logger.py +++ b/application/single_app/semantic_kernel_plugins/plugin_invocation_logger.py @@ -60,6 +60,102 @@ def to_json(self) -> str: return json.dumps(self.to_dict(), default=str, indent=2) +def _compact_plugin_log_value(value: Any, max_length: int = 160) -> Any: + """Return a compact logging-safe representation for structured plugin summaries.""" + if value is None or isinstance(value, (int, float, bool)): + return value + + if isinstance(value, str): + return value if len(value) <= max_length else f"{value[:max_length]}... [truncated]" + + if isinstance(value, list): + compact_items = [_compact_plugin_log_value(item, max_length=max_length) for item in value[:5]] + if len(value) > 5: + compact_items.append({'remaining_items': len(value) - 5}) + return compact_items + + if isinstance(value, dict): + compact_mapping = {} + for index, (key, item) in enumerate(value.items()): + if index >= 8: + compact_mapping['remaining_keys'] = len(value) - 8 + break + compact_mapping[str(key)] = _compact_plugin_log_value(item, max_length=max_length) + return compact_mapping + + return str(value) + + +def _build_plugin_result_logging_payload(plugin_name: str, function_name: str, result: Any) -> tuple: + """Build preview and structured summary payloads for plugin invocation logs.""" + result_str = str(result) + result_preview = result_str[:200] + "..." 
if len(result_str) > 200 else result_str + result_summary = None + + if plugin_name != 'TabularProcessingPlugin' or result is None: + return result_preview, result_summary + + try: + result_payload = json.loads(result) if isinstance(result, str) else result + except Exception: + return result_preview, result_summary + + if not isinstance(result_payload, dict): + return result_preview, result_summary + + summary = {} + key_names = ( + 'filename', + 'selected_sheet', + 'column', + 'search_value', + 'search_operator', + 'searched_columns', + 'matched_columns', + 'return_columns', + 'lookup_column', + 'target_column', + 'operation', + 'filter_applied', + 'normalize_match', + 'extract_mode', + 'extract_pattern', + 'url_path_segments', + 'distinct_count', + 'returned_values', + 'row_count', + 'rows_scanned', + 'total_matches', + 'returned_rows', + 'matched_cell_count', + 'extracted_match_count', + 'sheets_searched', + 'sheets_matched', + 'source_sheet', + 'target_sheet', + 'relationship_type', + 'source_cohort_size', + 'matched_target_row_count', + 'result', + 'error', + ) + for key_name in key_names: + if key_name in result_payload: + summary[key_name] = _compact_plugin_log_value(result_payload.get(key_name)) + + if isinstance(result_payload.get('values'), list): + summary['values_sample'] = _compact_plugin_log_value(result_payload['values'][:5]) + summary['values_sample_limited'] = len(result_payload['values']) > 5 + + if isinstance(result_payload.get('data'), list): + summary['data_sample_count'] = min(len(result_payload['data']), 5) + + if summary: + result_summary = summary + + return result_preview, result_summary + + class PluginInvocationLogger: """Centralized logger for all Semantic Kernel plugin invocations.""" @@ -127,8 +223,14 @@ def _log_to_terminal(self, invocation: PluginInvocation): if invocation.success: if invocation.result: - result_str = str(invocation.result) - log_data["result_preview"] = result_str[:200] + "..." 
if len(result_str) > 200 else result_str + result_preview, result_summary = _build_plugin_result_logging_payload( + invocation.plugin_name, + invocation.function_name, + invocation.result, + ) + log_data["result_preview"] = result_preview + if result_summary: + log_data["result_summary"] = result_summary log_data["result_type"] = type(invocation.result).__name__ log_event(f"Plugin function executed successfully", @@ -175,11 +277,17 @@ def _log_to_appinsights(self, invocation: PluginInvocation): # Add sanitized result if invocation.result is not None: - result_str = str(invocation.result) - if len(result_str) > 500: - log_data["result_preview"] = f"{result_str[:500]}... [truncated]" + result_preview, result_summary = _build_plugin_result_logging_payload( + invocation.plugin_name, + invocation.function_name, + invocation.result, + ) + if len(str(invocation.result)) > 500: + log_data["result_preview"] = f"{result_preview[:500]}... [truncated]" else: - log_data["result_preview"] = result_str + log_data["result_preview"] = result_preview + if result_summary: + log_data["result_summary"] = result_summary log_event( f"[Plugin Invocation] {invocation.plugin_name}.{invocation.function_name}", @@ -487,13 +595,18 @@ def _log_parameters(function_name: str, parameters: Dict[str, Any]): ) def _log_success(function_name: str, result: Any, duration_ms: float): - result_preview = str(result)[:200] + "..." 
if len(str(result)) > 200 else str(result) + result_preview, result_summary = _build_plugin_result_logging_payload( + plugin_name, + function_name, + result, + ) log_event( f"[Plugin Function Logger] Function completed successfully", extra={ "plugin_name": plugin_name, "function_name": function_name, "result_preview": result_preview, + "result_summary": result_summary, "duration_ms": duration_ms, "full_function_name": f"{plugin_name}.{function_name}" }, diff --git a/application/single_app/semantic_kernel_plugins/tabular_processing_plugin.py b/application/single_app/semantic_kernel_plugins/tabular_processing_plugin.py index cf7fc663..344d092a 100644 --- a/application/single_app/semantic_kernel_plugins/tabular_processing_plugin.py +++ b/application/single_app/semantic_kernel_plugins/tabular_processing_plugin.py @@ -16,6 +16,7 @@ import warnings import pandas from typing import Annotated, Dict, List, Optional, Set +from urllib.parse import urlsplit, urlunsplit from semantic_kernel.functions import kernel_function from semantic_kernel_plugins.plugin_invocation_logger import plugin_function_logger from functions_appinsights import log_event @@ -43,6 +44,7 @@ class TabularProcessingPlugin: 'count_rows', 'aggregate_column', 'filter_rows', + 'search_rows', 'query_tabular_data', 'filter_rows_by_related_values', 'count_rows_by_related_values', @@ -110,6 +112,7 @@ def __init__(self): self._blob_data_cache = {} # Per-instance cache: (container, blob_name) -> raw bytes self._workbook_metadata_cache = {} # Per-instance cache: (container, blob_name) -> workbook metadata self._default_sheet_overrides = {} # (container, blob_name) -> default sheet name + self._resolved_blob_location_overrides = {} # (source, filename) -> (container, blob_name) @classmethod def get_discovery_function_names(cls): @@ -130,6 +133,53 @@ def set_default_sheet(self, container_name: str, blob_name: str, sheet_name: str """Set the default sheet for a workbook so the model doesn't need to specify it.""" 
self._default_sheet_overrides[(container_name, blob_name)] = sheet_name + def remember_resolved_blob_location(self, source: str, filename: str, container_name: str, blob_name: str): + """Remember a resolved blob location so later tool calls can reuse it without resupplying scope ids.""" + normalized_filename = str(filename or '').strip() + if not normalized_filename: + return + + normalized_source = str(source or '').strip().lower() + if normalized_source: + self._resolved_blob_location_overrides[(normalized_source, normalized_filename)] = (container_name, blob_name) + + inferred_source = self._infer_source_from_container(container_name) + if inferred_source: + self._resolved_blob_location_overrides[(inferred_source, normalized_filename)] = (container_name, blob_name) + + def _infer_source_from_container(self, container_name: str) -> Optional[str]: + """Infer the logical tabular source from the backing blob container name.""" + if container_name == storage_account_user_documents_container_name: + return 'workspace' + if container_name == storage_account_personal_chat_container_name: + return 'chat' + if container_name == storage_account_group_documents_container_name: + return 'group' + if container_name == storage_account_public_documents_container_name: + return 'public' + return None + + def _get_resolved_blob_location_override(self, source: str, filename: str) -> Optional[tuple]: + """Return a remembered blob location override when one is available for this analysis run.""" + normalized_filename = str(filename or '').strip() + if not normalized_filename: + return None + + normalized_source = str(source or '').strip().lower() + exact_match = self._resolved_blob_location_overrides.get((normalized_source, normalized_filename)) + if exact_match: + return exact_match + + filename_matches = [ + blob_location + for (override_source, override_filename), blob_location in self._resolved_blob_location_overrides.items() + if override_filename == normalized_filename + ] + 
if len(filename_matches) == 1: + return filename_matches[0] + + return None + def _get_blob_service_client(self): """Get the blob service client from CLIENTS cache.""" client = CLIENTS.get("storage_account_office_docs_client") @@ -214,14 +264,12 @@ def _resolve_sheet_selection( if not available_sheets: raise ValueError(f"Workbook '{blob_name}' does not contain any readable sheets.") + matched_sheet_name = self._match_workbook_sheet_name(sheet_name, available_sheets) + if matched_sheet_name: + return matched_sheet_name, workbook_metadata + normalized_sheet_name = (sheet_name or '').strip() if normalized_sheet_name: - for candidate in available_sheets: - if candidate == normalized_sheet_name: - return candidate, workbook_metadata - for candidate in available_sheets: - if candidate.lower() == normalized_sheet_name.lower(): - return candidate, workbook_metadata raise ValueError( f"Sheet '{normalized_sheet_name}' was not found in workbook '{blob_name}'. " f"Available sheets: {available_sheets}." 
@@ -250,9 +298,9 @@ def _resolve_sheet_selection( override_key = (container_name, blob_name) if override_key in self._default_sheet_overrides: override_sheet = self._default_sheet_overrides[override_key] - for candidate in available_sheets: - if candidate == override_sheet or candidate.lower() == override_sheet.lower(): - return candidate, workbook_metadata + matched_override_sheet = self._match_workbook_sheet_name(override_sheet, available_sheets) + if matched_override_sheet: + return matched_override_sheet, workbook_metadata if require_explicit_sheet: raise ValueError( @@ -262,6 +310,34 @@ def _resolve_sheet_selection( return workbook_metadata.get('default_sheet'), workbook_metadata + def _match_workbook_sheet_name(self, requested_sheet_name: Optional[str], available_sheets: List[str]) -> Optional[str]: + """Match a workbook sheet name while tolerating trailing whitespace and case drift.""" + raw_sheet_name = None if requested_sheet_name is None else str(requested_sheet_name) + normalized_sheet_name = (raw_sheet_name or '').strip() + if not normalized_sheet_name: + return None + + for candidate in available_sheets: + if candidate == raw_sheet_name: + return candidate + + for candidate in available_sheets: + if candidate.strip() == normalized_sheet_name: + return candidate + + raw_sheet_name_casefold = (raw_sheet_name or '').casefold() + if raw_sheet_name_casefold: + for candidate in available_sheets: + if candidate.casefold() == raw_sheet_name_casefold: + return candidate + + normalized_sheet_name_casefold = normalized_sheet_name.casefold() + for candidate in available_sheets: + if candidate.strip().casefold() == normalized_sheet_name_casefold: + return candidate + + return None + def _filter_rows_across_sheets( self, container_name: str, @@ -270,6 +346,9 @@ def _filter_rows_across_sheets( column: str, operator_str: str, value: str, + additional_filter_column: Optional[str] = None, + additional_filter_operator: str = 'equals', + additional_filter_value=None, 
normalize_match: bool = False, max_rows: int = 100, ) -> Optional[str]: @@ -302,18 +381,23 @@ def _filter_rows_across_sheets( if column not in df.columns: continue - sheets_searched.append(sheet) try: - mask = self._build_series_match_mask( - df[column], - operator_str, - value, + filtered_df, applied_filters = self._apply_optional_dataframe_filters( + df, + filter_column=column, + filter_operator=operator_str, + filter_value=value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match, ) - except ValueError: + except (KeyError, ValueError): continue - sheet_matches = int(mask.sum()) + sheets_searched.append(sheet) + + sheet_matches = len(filtered_df) if sheet_matches == 0: continue @@ -321,7 +405,7 @@ def _filter_rows_across_sheets( total_matches += sheet_matches remaining_capacity = max(0, max_rows - len(combined_results)) if remaining_capacity > 0: - filtered = df[mask].head(remaining_capacity) + filtered = filtered_df.head(remaining_capacity) for row in filtered.to_dict(orient='records'): row['_sheet'] = sheet combined_results.append(row) @@ -342,11 +426,164 @@ def _filter_rows_across_sheets( "selected_sheet": "ALL (cross-sheet search)", "sheets_searched": sheets_searched, "sheets_matched": sheets_matched, + "filter_applied": applied_filters, "total_matches": total_matches, "returned_rows": len(combined_results), "data": combined_results, }, indent=2, default=str) + def _search_rows_across_sheets( + self, + container_name: str, + blob_name: str, + filename: str, + search_value: str, + search_columns=None, + search_operator: str = 'contains', + return_columns=None, + query_expression: Optional[str] = None, + filter_column: Optional[str] = None, + filter_operator: str = 'equals', + filter_value=None, + additional_filter_column: Optional[str] = None, + additional_filter_operator: str = 'equals', + additional_filter_value=None, 
+ normalize_match: bool = False, + max_rows: int = 100, + ) -> Optional[str]: + """Search rows across worksheets when the relevant text column is unknown or broad.""" + workbook_metadata = self._get_workbook_metadata(container_name, blob_name) + if not workbook_metadata.get('is_workbook'): + return None + + available_sheets = workbook_metadata.get('sheet_names', []) + if len(available_sheets) <= 1: + return None + + requested_search_columns = self._parse_optional_column_list_argument(search_columns) + requested_return_columns = self._parse_optional_column_list_argument(return_columns) + combined_results = [] + sheets_searched = [] + sheets_matched = [] + total_matches = 0 + applied_filters = [] + searched_columns = [] + seen_searched_columns = set() + matched_columns = [] + seen_matched_columns = set() + + for sheet in available_sheets: + df = self._read_tabular_blob_to_dataframe( + container_name, + blob_name, + sheet_name=sheet, + ) + df = self._try_numeric_conversion(df) + + try: + filtered_df, sheet_filters = self._apply_optional_dataframe_filters( + df, + query_expression=query_expression, + filter_column=filter_column, + filter_operator=filter_operator, + filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, + normalize_match=normalize_match, + ) + except KeyError: + continue + except Exception as query_error: + return json.dumps({ + 'error': f"Query/filter error: {query_error}", + 'filename': filename, + 'selected_sheet': 'ALL (cross-sheet search)', + }, indent=2, default=str) + + remaining_capacity = max(0, max_rows - len(combined_results)) + if remaining_capacity <= 0: + break + + try: + search_result = self._search_dataframe_rows( + filtered_df, + search_value=search_value, + search_columns=requested_search_columns, + search_operator=search_operator, + return_columns=requested_return_columns, + 
normalize_match=normalize_match, + max_rows=remaining_capacity, + ) + except KeyError: + continue + except ValueError as search_error: + return json.dumps({ + 'error': str(search_error), + 'filename': filename, + 'selected_sheet': 'ALL (cross-sheet search)', + }, indent=2, default=str) + + sheets_searched.append(sheet) + applied_filters = sheet_filters or applied_filters + for column_name in search_result['searched_columns']: + lowered_name = str(column_name).lower() + if lowered_name in seen_searched_columns: + continue + seen_searched_columns.add(lowered_name) + searched_columns.append(column_name) + + sheet_match_count = int(search_result['total_matches']) + total_matches += sheet_match_count + if sheet_match_count > 0: + sheets_matched.append(sheet) + + for column_name in search_result['matched_columns']: + lowered_name = str(column_name).lower() + if lowered_name in seen_matched_columns: + continue + seen_matched_columns.add(lowered_name) + matched_columns.append(column_name) + + for row in search_result['data']: + row['_sheet'] = sheet + combined_results.append(row) + + if not sheets_searched: + if requested_search_columns: + return json.dumps({ + 'error': 'None of the requested search_columns were found on any worksheet during cross-sheet search.', + 'filename': filename, + 'selected_sheet': 'ALL (cross-sheet search)', + 'search_columns': requested_search_columns, + }, indent=2, default=str) + return None + + log_event( + f"[TabularProcessingPlugin] Cross-sheet search_rows: " + f"searched {len(sheets_searched)} sheets, " + f"matched on {len(sheets_matched)} ({sheets_matched}), " + f"total_matches={total_matches}", + level=logging.INFO, + ) + + return json.dumps({ + 'filename': filename, + 'selected_sheet': 'ALL (cross-sheet search)', + 'search_value': search_value, + 'search_operator': search_operator, + 'searched_columns': searched_columns, + 'matched_columns': matched_columns, + 'return_columns': requested_return_columns, + 'sheets_searched': 
sheets_searched, + 'sheets_matched': sheets_matched, + 'filter_applied': applied_filters, + 'normalize_match': normalize_match, + 'total_matches': total_matches, + 'returned_rows': len(combined_results), + 'data': combined_results, + }, indent=2, default=str) + def _lookup_value_across_sheets( self, container_name: str, @@ -485,7 +722,11 @@ def _query_tabular_data_across_sheets( df = self._try_numeric_conversion(df) try: - result_df = df.query(query_expression) + result_df, _ = self._apply_query_expression_with_fallback( + df, + query_expression=query_expression, + normalize_match=False, + ) except Exception as query_error: query_errors.append({ 'sheet_name': sheet, @@ -558,6 +799,9 @@ def _count_rows_across_sheets( filter_column: Optional[str] = None, filter_operator: str = 'equals', filter_value=None, + additional_filter_column: Optional[str] = None, + additional_filter_operator: str = 'equals', + additional_filter_value=None, query_expression: Optional[str] = None, normalize_match: bool = False, ) -> Optional[str]: @@ -591,6 +835,9 @@ def _count_rows_across_sheets( filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match, ) except KeyError: @@ -653,6 +900,12 @@ def _get_distinct_values_across_sheets( filter_column: Optional[str] = None, filter_operator: str = 'equals', filter_value=None, + additional_filter_column: Optional[str] = None, + additional_filter_operator: str = 'equals', + additional_filter_value=None, + extract_mode: Optional[str] = None, + extract_pattern: Optional[str] = None, + url_path_segments: Optional[int] = None, normalize_match: bool = False, max_values: int = 100, ) -> Optional[str]: @@ -668,6 +921,8 @@ def _get_distinct_values_across_sheets( sheets_searched = [] sheets_matched = [] distinct_display_values = {} + 
matched_cell_count = 0 + extracted_match_count = 0 query_errors = [] applied_filters = [] @@ -687,6 +942,9 @@ def _get_distinct_values_across_sheets( filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match, ) except KeyError: @@ -701,20 +959,20 @@ def _get_distinct_values_across_sheets( sheets_searched.append(sheet) applied_filters = sheet_filters or applied_filters - for cell_value in filtered_df[column].tolist(): - display_value = str(cell_value).strip() - if not display_value: - continue - compare_variants = self._extract_cell_value_variants( - cell_value, - normalize_match=normalize_match, - ) - if not compare_variants: - continue - canonical_key = sorted(compare_variants)[0] + sheet_distinct_values, sheet_matched_cells, sheet_extracted_matches = self._collect_distinct_display_values( + filtered_df[column], + normalize_match=normalize_match, + extract_mode=extract_mode, + extract_pattern=extract_pattern, + url_path_segments=url_path_segments, + ) + matched_cell_count += sheet_matched_cells + extracted_match_count += sheet_extracted_matches + for canonical_key, display_value in sheet_distinct_values.items(): distinct_display_values.setdefault(canonical_key, display_value) - if not filtered_df.empty: + sheet_match_count = sheet_matched_cells if extract_mode else len(filtered_df) + if sheet_match_count > 0: sheets_matched.append(sheet) if not sheets_searched: @@ -741,7 +999,7 @@ def _get_distinct_values_across_sheets( return None ordered_values = sorted(distinct_display_values.values(), key=lambda item: item.casefold()) - return json.dumps({ + response_payload = { 'filename': filename, 'selected_sheet': 'ALL (cross-sheet search)', 'column': column, @@ -753,7 +1011,16 @@ def _get_distinct_values_across_sheets( 'returned_values': 
min(len(ordered_values), int(max_values)), 'values': ordered_values[:int(max_values)], 'values_limited': len(ordered_values) > int(max_values), - }, indent=2, default=str) + } + if extract_mode: + response_payload.update({ + 'extract_mode': extract_mode, + 'extract_pattern': extract_pattern if extract_mode == 'regex' else None, + 'url_path_segments': url_path_segments if extract_mode == 'url' else None, + 'matched_cell_count': matched_cell_count, + 'extracted_match_count': extracted_match_count, + }) + return json.dumps(response_payload, indent=2, default=str) def _evaluate_related_value_membership( self, @@ -1119,6 +1386,334 @@ def _extract_cell_value_variants(self, value, normalize_match: bool = False) -> return variants + def _normalize_distinct_extraction_arguments( + self, + extract_mode: Optional[str] = None, + extract_pattern: Optional[str] = None, + url_path_segments: Optional[str] = None, + ) -> tuple: + """Validate and normalize optional embedded extraction arguments.""" + normalized_extract_mode = str(extract_mode or '').strip().lower() or None + if normalized_extract_mode not in {None, 'url', 'regex'}: + raise ValueError("Unsupported extract_mode. 
Use 'url' or 'regex'.") + + normalized_extract_pattern = str(extract_pattern or '').strip() or None + if normalized_extract_mode == 'regex' and not normalized_extract_pattern: + raise ValueError('extract_pattern is required when extract_mode is regex.') + if normalized_extract_mode != 'regex': + normalized_extract_pattern = None + + parsed_url_path_segments = None + if url_path_segments not in (None, ''): + try: + parsed_url_path_segments = int(url_path_segments) + except (TypeError, ValueError): + raise ValueError('url_path_segments must be an integer when provided.') + if parsed_url_path_segments < 0: + raise ValueError('url_path_segments must be zero or greater when provided.') + + if normalized_extract_mode != 'url': + parsed_url_path_segments = None + + return normalized_extract_mode, normalized_extract_pattern, parsed_url_path_segments + + def _normalize_embedded_url_match(self, raw_match, url_path_segments: Optional[int] = None) -> Optional[str]: + """Normalize an extracted URL for stable distinct-value analysis.""" + cleaned_match = str(raw_match or '').strip().rstrip('.,;:!?)]}\"\'') + if not cleaned_match: + return None + + parsed_url = urlsplit(cleaned_match) + if not parsed_url.scheme or not parsed_url.netloc: + return cleaned_match + + path_segments = [segment for segment in parsed_url.path.split('/') if segment] + if url_path_segments is not None: + path_segments = path_segments[:url_path_segments] + + normalized_path = '' + if path_segments: + normalized_path = '/' + '/'.join(path_segments) + + return urlunsplit(( + parsed_url.scheme.lower(), + parsed_url.netloc.lower(), + normalized_path, + '', + '', + )) + + def _extract_embedded_matches_from_text( + self, + value, + extract_mode: Optional[str] = None, + extract_pattern: Optional[str] = None, + url_path_segments: Optional[int] = None, + ) -> List[str]: + """Extract embedded URL or regex matches from a composite text cell.""" + if value is None or (not isinstance(value, str) and pandas.isna(value)): 
+ return [] + + rendered_text = str(value).strip() + if not rendered_text or not extract_mode: + return [] + + normalized_extract_mode = str(extract_mode or '').strip().lower() + extracted_matches = [] + + if normalized_extract_mode == 'url': + for raw_match in re.findall(r'https?://[^\s<>"\'\]\)]+', rendered_text, flags=re.IGNORECASE): + normalized_match = self._normalize_embedded_url_match( + raw_match, + url_path_segments=url_path_segments, + ) + if normalized_match: + extracted_matches.append(normalized_match) + elif normalized_extract_mode == 'regex': + compiled_pattern = re.compile(extract_pattern, flags=re.IGNORECASE) + for match in compiled_pattern.finditer(rendered_text): + candidate_value = None + if match.lastindex: + for group_value in match.groups(): + if group_value: + candidate_value = group_value + break + if candidate_value is None: + candidate_value = match.group(0) + + cleaned_candidate = str(candidate_value or '').strip().rstrip('.,;:!?)]}\"\'') + if cleaned_candidate: + extracted_matches.append(cleaned_candidate) + else: + raise ValueError("Unsupported extract_mode. 
Use 'url' or 'regex'.") + + unique_matches = [] + seen_matches = set() + for extracted_match in extracted_matches: + canonical_match = str(extracted_match).casefold().strip() + if not canonical_match or canonical_match in seen_matches: + continue + seen_matches.add(canonical_match) + unique_matches.append(str(extracted_match).strip()) + + return unique_matches + + def _collect_distinct_value_candidates( + self, + value, + normalize_match: bool = False, + extract_mode: Optional[str] = None, + extract_pattern: Optional[str] = None, + url_path_segments: Optional[int] = None, + ) -> List[dict]: + """Return display/canonical pairs for raw or embedded distinct-value extraction.""" + normalized_extract_mode = str(extract_mode or '').strip().lower() or None + + if normalized_extract_mode: + candidates = [] + for extracted_match in self._extract_embedded_matches_from_text( + value, + extract_mode=normalized_extract_mode, + extract_pattern=extract_pattern, + url_path_segments=url_path_segments, + ): + display_value = str(extracted_match).strip() + if not display_value: + continue + + if normalized_extract_mode == 'url': + canonical_key = display_value.casefold() + elif normalize_match: + canonical_key = self._normalize_entity_match_text(display_value) + else: + canonical_key = display_value.casefold() + + if not canonical_key: + continue + + candidates.append({ + 'display_value': display_value, + 'canonical_key': canonical_key, + }) + + return candidates + + if value is None or (not isinstance(value, str) and pandas.isna(value)): + return [] + + display_value = str(value).strip() + if not display_value: + return [] + + compare_variants = self._extract_cell_value_variants( + value, + normalize_match=normalize_match, + ) + if not compare_variants: + return [] + + return [{ + 'display_value': display_value, + 'canonical_key': sorted(compare_variants)[0], + }] + + def _collect_distinct_display_values( + self, + series: pandas.Series, + normalize_match: bool = False, + 
extract_mode: Optional[str] = None, + extract_pattern: Optional[str] = None, + url_path_segments: Optional[int] = None, + ) -> tuple: + """Collect display values and counts for deterministic distinct-value analysis.""" + distinct_display_values = {} + matched_cell_count = 0 + extracted_match_count = 0 + + for cell_value in series.tolist(): + candidates = self._collect_distinct_value_candidates( + cell_value, + normalize_match=normalize_match, + extract_mode=extract_mode, + extract_pattern=extract_pattern, + url_path_segments=url_path_segments, + ) + if not candidates: + continue + + matched_cell_count += 1 + extracted_match_count += len(candidates) + for candidate in candidates: + distinct_display_values.setdefault( + candidate['canonical_key'], + candidate['display_value'], + ) + + return distinct_display_values, matched_cell_count, extracted_match_count + + def _parse_optional_column_list_argument(self, raw_columns) -> Optional[List[str]]: + """Parse an optional comma-separated or JSON-array column list argument.""" + if raw_columns is None: + return None + + candidate_values = None + if isinstance(raw_columns, (list, tuple, set)): + candidate_values = list(raw_columns) + else: + rendered_columns = str(raw_columns).strip() + if not rendered_columns: + return None + if rendered_columns.casefold() in {'*', 'all', 'all_columns', 'all columns'}: + return None + + if rendered_columns.startswith('['): + try: + parsed_columns = json.loads(rendered_columns) + except Exception: + parsed_columns = None + if isinstance(parsed_columns, list): + candidate_values = parsed_columns + + if candidate_values is None: + candidate_values = re.split(r'[,;|\n]+', rendered_columns) + + normalized_columns = [] + seen_columns = set() + for candidate_value in candidate_values: + normalized_column = str(candidate_value or '').strip() + if not normalized_column: + continue + lowered_column = normalized_column.casefold() + if lowered_column in seen_columns: + continue + 
seen_columns.add(lowered_column) + normalized_columns.append(normalized_column) + + return normalized_columns or None + + def _search_dataframe_rows( + self, + df: pandas.DataFrame, + search_value, + search_columns=None, + search_operator: str = 'contains', + return_columns=None, + normalize_match: bool = False, + max_rows: int = 100, + ) -> dict: + """Search one or more columns in a DataFrame and return row-context results.""" + requested_search_columns = self._parse_optional_column_list_argument(search_columns) + requested_return_columns = self._parse_optional_column_list_argument(return_columns) + + if requested_search_columns: + resolved_search_columns = [ + column_name for column_name in requested_search_columns + if column_name in df.columns + ] + if not resolved_search_columns: + raise KeyError(requested_search_columns[0]) + else: + resolved_search_columns = list(df.columns) + + resolved_return_columns = [ + column_name for column_name in (requested_return_columns or []) + if column_name in df.columns + ] + + combined_mask = pandas.Series([False] * len(df), index=df.index) + column_masks = {} + for column_name in resolved_search_columns: + column_mask = self._build_series_match_mask( + df[column_name], + search_operator, + search_value, + normalize_match=normalize_match, + ).fillna(False) + column_masks[column_name] = column_mask + combined_mask = combined_mask | column_mask + + matched_df = df[combined_mask] + matched_columns = [] + seen_matched_columns = set() + result_rows = [] + + for row_index, row in matched_df.head(int(max_rows)).iterrows(): + row_matched_columns = [] + for column_name in resolved_search_columns: + if not bool(column_masks[column_name].loc[row_index]): + continue + row_matched_columns.append(column_name) + lowered_column = column_name.casefold() + if lowered_column not in seen_matched_columns: + seen_matched_columns.add(lowered_column) + matched_columns.append(column_name) + + if resolved_return_columns: + row_payload = { + 
column_name: row.get(column_name) + for column_name in resolved_return_columns + } + else: + row_payload = { + str(key): value for key, value in row.to_dict().items() + } + + row_payload['_matched_columns'] = row_matched_columns + row_payload['_matched_values'] = { + column_name: row.get(column_name) + for column_name in row_matched_columns + } + result_rows.append(row_payload) + + return { + 'searched_columns': resolved_search_columns, + 'matched_columns': matched_columns, + 'return_columns': resolved_return_columns or None, + 'total_matches': len(matched_df), + 'returned_rows': len(result_rows), + 'data': result_rows, + } + def _build_series_match_mask( self, series: pandas.Series, @@ -1197,6 +1792,213 @@ def _build_series_match_mask( raise ValueError(f"Unsupported operator: {operator}") + def _normalize_pseudo_query_column_reference(self, raw_column_name: str) -> str: + """Normalize a reviewer-style query column reference into a DataFrame column name.""" + normalized_column_name = str(raw_column_name or '').strip() + if normalized_column_name.startswith('`') and normalized_column_name.endswith('`'): + normalized_column_name = normalized_column_name[1:-1] + return normalized_column_name.strip() + + def _build_pseudo_query_string_method_mask( + self, + series: pandas.Series, + operator: str, + value, + case_sensitive: bool = False, + normalize_match: bool = False, + ) -> pandas.Series: + """Build a boolean mask for reviewer-style string method clauses.""" + if normalize_match and not case_sensitive: + return self._build_series_match_mask( + series, + operator, + value, + normalize_match=True, + ) + + if not case_sensitive: + return self._build_series_match_mask( + series, + operator, + value, + normalize_match=False, + ) + + text_series = series.astype(str) + value_text = str(value) + if operator == 'contains': + return text_series.str.contains(value_text, regex=False, case=True, na=False) + if operator == 'startswith': + return 
text_series.str.startswith(value_text, na=False) + if operator == 'endswith': + return text_series.str.endswith(value_text, na=False) + + raise ValueError(f"Unsupported operator: {operator}") + + def _apply_reviewer_style_query_expression( + self, + df: pandas.DataFrame, + query_expression: str, + normalize_match: bool = False, + ) -> Optional[pandas.DataFrame]: + """Apply limited reviewer-style pseudo-pandas filters when DataFrame.query syntax is invalid.""" + rendered_query_expression = str(query_expression or '').strip() + if not rendered_query_expression: + return df + + lowered_expression = rendered_query_expression.casefold() + if ' or ' in lowered_expression or '||' in rendered_query_expression or '|' in rendered_query_expression: + return None + + clause_texts = [ + clause.strip() + for clause in re.split(r'\s+(?i:and)\s+|&&', rendered_query_expression) + if clause.strip() + ] + if not clause_texts: + return None + + notnull_pattern = re.compile( + r"^\s*(?P`[^`]+`|[A-Za-z_][A-Za-z0-9_]*)\s*\.\s*notnull\(\)\s*$", + flags=re.IGNORECASE, + ) + isnull_pattern = re.compile( + r"^\s*(?P`[^`]+`|[A-Za-z_][A-Za-z0-9_]*)\s*\.\s*isnull\(\)\s*$", + flags=re.IGNORECASE, + ) + string_method_pattern = re.compile( + r"^\s*(?P`[^`]+`|[A-Za-z_][A-Za-z0-9_]*)" + r"(?:\s*\.\s*astype\(\s*str\s*\))?\s*\.\s*str\s*\.\s*" + r"(?Pcontains|startswith|endswith)\(\s*" + r"(?P['\"])(?P.*?)(?P=quote)(?P[^)]*)\)\s*$", + flags=re.IGNORECASE, + ) + equality_pattern = re.compile( + r"^\s*(?P`[^`]+`|[A-Za-z_][A-Za-z0-9_]*)\s*" + r"(?P==|!=)\s*" + r"(?P['\"])(?P.*?)(?P=quote)\s*$", + flags=re.IGNORECASE, + ) + null_literal_pattern = re.compile( + r"^\s*(?P`[^`]+`|[A-Za-z_][A-Za-z0-9_]*)\s*" + r"(?P==|!=)\s*" + r"(?Pnull|none|nan)\s*$", + flags=re.IGNORECASE, + ) + + filtered_df = df + matched_any_clause = False + + for clause_text in clause_texts: + normalized_clause_text = clause_text.strip() + while normalized_clause_text.startswith('(') and normalized_clause_text.endswith(')'): + 
normalized_clause_text = normalized_clause_text[1:-1].strip() + + match = notnull_pattern.match(normalized_clause_text) + if match: + column_name = self._normalize_pseudo_query_column_reference(match.group('column')) + if column_name not in filtered_df.columns: + raise KeyError(column_name) + filtered_df = filtered_df[filtered_df[column_name].notna()] + matched_any_clause = True + continue + + match = isnull_pattern.match(normalized_clause_text) + if match: + column_name = self._normalize_pseudo_query_column_reference(match.group('column')) + if column_name not in filtered_df.columns: + raise KeyError(column_name) + filtered_df = filtered_df[filtered_df[column_name].isna()] + matched_any_clause = True + continue + + match = string_method_pattern.match(normalized_clause_text) + if match: + column_name = self._normalize_pseudo_query_column_reference(match.group('column')) + if column_name not in filtered_df.columns: + raise KeyError(column_name) + + method_name = str(match.group('method') or '').strip().lower() + operator_name = { + 'contains': 'contains', + 'startswith': 'startswith', + 'endswith': 'endswith', + }.get(method_name) + if not operator_name: + return None + + args_text = str(match.group('args') or '').replace(' ', '').casefold() + if 'regex=true' in args_text: + return None + case_sensitive = 'case=true' in args_text + + mask = self._build_pseudo_query_string_method_mask( + filtered_df[column_name], + operator_name, + match.group('value'), + case_sensitive=case_sensitive, + normalize_match=normalize_match, + ) + filtered_df = filtered_df[mask] + matched_any_clause = True + continue + + match = equality_pattern.match(normalized_clause_text) + if match: + column_name = self._normalize_pseudo_query_column_reference(match.group('column')) + if column_name not in filtered_df.columns: + raise KeyError(column_name) + + operator_name = 'equals' if match.group('operator') == '==' else '!=' + mask = self._build_series_match_mask( + filtered_df[column_name], + 
operator_name, + match.group('value'), + normalize_match=normalize_match, + ) + filtered_df = filtered_df[mask] + matched_any_clause = True + continue + + match = null_literal_pattern.match(normalized_clause_text) + if match: + column_name = self._normalize_pseudo_query_column_reference(match.group('column')) + if column_name not in filtered_df.columns: + raise KeyError(column_name) + + if match.group('operator') == '==': + filtered_df = filtered_df[filtered_df[column_name].isna()] + else: + filtered_df = filtered_df[filtered_df[column_name].notna()] + matched_any_clause = True + continue + + return None + + return filtered_df if matched_any_clause else None + + def _apply_query_expression_with_fallback( + self, + df: pandas.DataFrame, + query_expression: Optional[str] = None, + normalize_match: bool = False, + ) -> tuple: + """Apply DataFrame.query syntax first, then fall back to limited reviewer-style parsing.""" + if not query_expression: + return df, False + + try: + return df.query(query_expression), False + except Exception as query_error: + fallback_df = self._apply_reviewer_style_query_expression( + df, + query_expression, + normalize_match=normalize_match, + ) + if fallback_df is not None: + return fallback_df, True + raise query_error + def _apply_optional_dataframe_filters( self, df: pandas.DataFrame, @@ -1204,31 +2006,67 @@ def _apply_optional_dataframe_filters( filter_column: Optional[str] = None, filter_operator: str = 'equals', filter_value=None, + additional_filter_column: Optional[str] = None, + additional_filter_operator: str = 'equals', + additional_filter_value=None, normalize_match: bool = False, ) -> tuple: - """Apply optional query and single-column filters to a DataFrame.""" + """Apply optional query and up to two single-column filters to a DataFrame.""" filtered_df = df applied_filters = [] if query_expression: - filtered_df = filtered_df.query(query_expression) - applied_filters.append(f"query_expression={query_expression}") + filtered_df, 
used_reviewer_style_fallback = self._apply_query_expression_with_fallback( + filtered_df, + query_expression=query_expression, + normalize_match=normalize_match, + ) + applied_filters.append( + f"query_expression={query_expression}" + + (' [reviewer-style fallback]' if used_reviewer_style_fallback else '') + ) - if filter_column: - if filter_column not in filtered_df.columns: - raise KeyError(filter_column) - if filter_value is None: - raise ValueError('filter_value is required when filter_column is provided.') + structured_filters = [ + { + 'column': filter_column, + 'operator': filter_operator, + 'value': filter_value, + 'column_argument': 'filter_column', + 'value_argument': 'filter_value', + }, + { + 'column': additional_filter_column, + 'operator': additional_filter_operator, + 'value': additional_filter_value, + 'column_argument': 'additional_filter_column', + 'value_argument': 'additional_filter_value', + }, + ] + + for filter_spec in structured_filters: + current_filter_column = filter_spec['column'] + if not current_filter_column: + continue + + if current_filter_column not in filtered_df.columns: + raise KeyError(current_filter_column) + + current_filter_value = filter_spec['value'] + if current_filter_value is None: + raise ValueError( + f"{filter_spec['value_argument']} is required when {filter_spec['column_argument']} is provided." 
+ ) + current_filter_operator = filter_spec['operator'] or 'equals' mask = self._build_series_match_mask( - filtered_df[filter_column], - filter_operator, - filter_value, + filtered_df[current_filter_column], + current_filter_operator, + current_filter_value, normalize_match=normalize_match, ) filtered_df = filtered_df[mask] applied_filters.append( - f"{filter_column} {filter_operator or 'equals'} {filter_value}" + f"{current_filter_column} {current_filter_operator} {current_filter_value}" + (' [normalized]' if normalize_match else '') ) @@ -1917,6 +2755,10 @@ def _resolve_blob_location_with_fallback(self, user_id: str, conversation_id: st group_id: str = None, public_workspace_id: str = None) -> tuple: """Try primary source first, then fall back to other containers if blob not found.""" source = source.lower().strip() + override = self._get_resolved_blob_location_override(source, filename) + if override: + return override + attempts = [] # Primary attempt based on specified source @@ -2254,8 +3096,8 @@ def _sync_work(): @kernel_function( description=( - "Return deterministic distinct values for a column, with optional query_expression or filter criteria. " - "Use this to build a canonical cohort from a worksheet before counting or joining related rows." + "Return deterministic distinct values for a column, with optional query_expression, up to two column filters, and optional embedded URL or regex extraction from composite text cells. " + "Use this to build a canonical cohort from a worksheet before counting or joining related rows. Narrow the original text column first when category membership depends on surrounding cell context." 
), name="get_distinct_values" ) @@ -2270,6 +3112,12 @@ async def get_distinct_values( filter_column: Annotated[Optional[str], "Optional column to filter on before collecting distinct values"] = None, filter_operator: Annotated[str, "Optional filter operator when filter_column is provided"] = "equals", filter_value: Annotated[Optional[str], "Optional filter value when filter_column is provided"] = None, + additional_filter_column: Annotated[Optional[str], "Optional second column to filter on before collecting distinct values"] = None, + additional_filter_operator: Annotated[str, "Optional filter operator when additional_filter_column is provided"] = "equals", + additional_filter_value: Annotated[Optional[str], "Optional filter value when additional_filter_column is provided"] = None, + extract_mode: Annotated[Optional[str], "Optional embedded extraction mode: 'url' or 'regex'"] = None, + extract_pattern: Annotated[Optional[str], "Optional regex pattern when extract_mode is 'regex'"] = None, + url_path_segments: Annotated[Optional[str], "Optional number of URL path segments to keep when extract_mode is 'url'"] = None, normalize_match: Annotated[str, "Whether to normalize string/entity matching and deduplication (true/false)"] = "true", sheet_name: Annotated[Optional[str], "Optional worksheet name for Excel files. When omitted, the plugin may perform a cross-sheet distinct-value search."] = None, sheet_index: Annotated[Optional[str], "Optional zero-based worksheet index for Excel files. 
Ignored when sheet_name is provided."] = None, @@ -2282,6 +3130,11 @@ async def get_distinct_values( def _sync_work(): try: normalize_match_flag = self._parse_boolean_argument(normalize_match, default=True) + normalized_extract_mode, normalized_extract_pattern, parsed_url_path_segments = self._normalize_distinct_extraction_arguments( + extract_mode=extract_mode, + extract_pattern=extract_pattern, + url_path_segments=url_path_segments, + ) container, blob_path = self._resolve_blob_location_with_fallback( user_id, conversation_id, filename, source, group_id=group_id, public_workspace_id=public_workspace_id @@ -2299,6 +3152,12 @@ def _sync_work(): filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, + extract_mode=normalized_extract_mode, + extract_pattern=normalized_extract_pattern, + url_path_segments=parsed_url_path_segments, normalize_match=normalize_match_flag, max_values=int(max_values), ) @@ -2328,7 +3187,10 @@ def _sync_work(): workbook_metadata, selected_sheet, column, - related_columns=[filter_column] if filter_column else None, + related_columns=[ + candidate_column for candidate_column in (filter_column, additional_filter_column) + if candidate_column + ] or None, available_columns=list(df.columns), ) ) @@ -2340,6 +3202,9 @@ def _sync_work(): filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match_flag, ) except KeyError as missing_column_error: @@ -2352,7 +3217,10 @@ def _sync_work(): workbook_metadata, selected_sheet, missing_column, - related_columns=[column], + related_columns=[ + candidate_column for candidate_column in (column, 
filter_column, additional_filter_column) + if candidate_column + ], available_columns=list(df.columns), ) ) @@ -2363,23 +3231,17 @@ def _sync_work(): 'selected_sheet': selected_sheet if workbook_metadata.get('is_workbook') else None, }) - distinct_display_values = {} - for cell_value in filtered_df[column].tolist(): - display_value = str(cell_value).strip() - if not display_value: - continue - compare_variants = self._extract_cell_value_variants( - cell_value, - normalize_match=normalize_match_flag, - ) - if not compare_variants: - continue - canonical_key = sorted(compare_variants)[0] - distinct_display_values.setdefault(canonical_key, display_value) + distinct_display_values, matched_cell_count, extracted_match_count = self._collect_distinct_display_values( + filtered_df[column], + normalize_match=normalize_match_flag, + extract_mode=normalized_extract_mode, + extract_pattern=normalized_extract_pattern, + url_path_segments=parsed_url_path_segments, + ) ordered_values = sorted(distinct_display_values.values(), key=lambda item: item.casefold()) limit = int(max_values) - return json.dumps({ + response_payload = { 'filename': filename, 'selected_sheet': selected_sheet if workbook_metadata.get('is_workbook') else None, 'column': column, @@ -2389,7 +3251,16 @@ def _sync_work(): 'returned_values': min(len(ordered_values), limit), 'values': ordered_values[:limit], 'values_limited': len(ordered_values) > limit, - }, indent=2, default=str) + } + if normalized_extract_mode: + response_payload.update({ + 'extract_mode': normalized_extract_mode, + 'extract_pattern': normalized_extract_pattern if normalized_extract_mode == 'regex' else None, + 'url_path_segments': parsed_url_path_segments if normalized_extract_mode == 'url' else None, + 'matched_cell_count': matched_cell_count, + 'extracted_match_count': extracted_match_count, + }) + return json.dumps(response_payload, indent=2, default=str) except Exception as e: log_event(f"[TabularProcessingPlugin] Error getting distinct 
values: {e}", level=logging.WARNING) return json.dumps({"error": str(e)}) @@ -2398,7 +3269,7 @@ def _sync_work(): @kernel_function( description=( - "Return a deterministic row count after applying an optional query_expression or filter condition. " + "Return a deterministic row count after applying an optional query_expression and up to two filter conditions. " "Use this instead of estimating counts from partial returned rows when the user asks how many or what percentage." ), name="count_rows" @@ -2413,6 +3284,9 @@ async def count_rows( filter_column: Annotated[Optional[str], "Optional column to filter on before counting rows"] = None, filter_operator: Annotated[str, "Optional filter operator when filter_column is provided"] = "equals", filter_value: Annotated[Optional[str], "Optional filter value when filter_column is provided"] = None, + additional_filter_column: Annotated[Optional[str], "Optional second column to filter on before counting rows"] = None, + additional_filter_operator: Annotated[str, "Optional filter operator when additional_filter_column is provided"] = "equals", + additional_filter_value: Annotated[Optional[str], "Optional filter value when additional_filter_column is provided"] = None, normalize_match: Annotated[str, "Whether to normalize string/entity matching for text comparisons (true/false)"] = "false", sheet_name: Annotated[Optional[str], "Optional worksheet name for Excel files. When omitted, the plugin may perform a cross-sheet row count."] = None, sheet_index: Annotated[Optional[str], "Optional zero-based worksheet index for Excel files. 
Ignored when sheet_name is provided."] = None, @@ -2439,6 +3313,9 @@ def _sync_work(): filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, query_expression=query_expression, normalize_match=normalize_match_flag, ) @@ -2467,6 +3344,9 @@ def _sync_work(): filter_column=filter_column, filter_operator=filter_operator, filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match_flag, ) except KeyError as missing_column_error: @@ -2479,6 +3359,10 @@ def _sync_work(): workbook_metadata, selected_sheet, missing_column, + related_columns=[ + candidate_column for candidate_column in (filter_column, additional_filter_column) + if candidate_column + ] or None, available_columns=list(df.columns), ) ) @@ -2598,7 +3482,8 @@ def _sync_work(): @kernel_function( description=( "Filter rows in a tabular file based on conditions and return matching rows. " - "Supports operators: ==, !=, >, <, >=, <=, contains, startswith, endswith." + "Supports operators: ==, !=, >, <, >=, <=, contains, startswith, endswith. " + "A second column filter can be applied for compound text or literal matching. Use this as the text-search tool when the full cell or row context matters." 
), name="filter_rows" ) @@ -2611,6 +3496,9 @@ async def filter_rows( column: Annotated[str, "The column to filter on"], operator: Annotated[str, "Operator: ==, !=, >, <, >=, <=, contains, startswith, endswith"], value: Annotated[str, "The value to compare against"], + additional_filter_column: Annotated[Optional[str], "Optional second column to filter on"] = None, + additional_filter_operator: Annotated[str, "Optional filter operator when additional_filter_column is provided"] = "equals", + additional_filter_value: Annotated[Optional[str], "Optional filter value when additional_filter_column is provided"] = None, normalize_match: Annotated[str, "Whether to normalize string/entity matching for text comparisons (true/false)"] = "false", sheet_name: Annotated[Optional[str], "Optional worksheet name for Excel files. Required for analytical calls on multi-sheet workbooks unless sheet_index is provided."] = None, sheet_index: Annotated[Optional[str], "Optional zero-based worksheet index for Excel files. 
Ignored when sheet_name is provided."] = None, @@ -2632,7 +3520,15 @@ def _sync_work(): normalized_sheet_idx = None if sheet_index is None else str(sheet_index).strip() if not normalized_sheet and normalized_sheet_idx in (None, ''): cross_sheet_result = self._filter_rows_across_sheets( - container, blob_path, filename, column, operator, value, + container, + blob_path, + filename, + column, + operator, + value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match_flag, max_rows=int(max_rows), ) @@ -2662,27 +3558,47 @@ def _sync_work(): workbook_metadata, selected_sheet, column, + related_columns=[additional_filter_column] if additional_filter_column else None, available_columns=list(df.columns), ) ) try: - mask = self._build_series_match_mask( - df[column], - operator, - value, + filtered_df, applied_filters = self._apply_optional_dataframe_filters( + df, + filter_column=column, + filter_operator=operator, + filter_value=value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, normalize_match=normalize_match_flag, ) - except ValueError: - return json.dumps({"error": f"Unsupported operator: {operator}"}) + except KeyError as missing_column_error: + missing_column = str(missing_column_error).strip("'") + return json.dumps( + self._build_missing_column_error_payload( + container, + blob_path, + filename, + workbook_metadata, + selected_sheet, + missing_column, + related_columns=[candidate_column for candidate_column in (column, additional_filter_column) if candidate_column], + available_columns=list(df.columns), + ) + ) + except ValueError as filter_error: + return json.dumps({"error": str(filter_error)}) limit = int(max_rows) - filtered = df[mask].head(limit) + filtered = filtered_df.head(limit) return json.dumps({ 
"filename": filename, "selected_sheet": selected_sheet if workbook_metadata.get('is_workbook') else None, + "filter_applied": applied_filters, "normalize_match": normalize_match_flag, - "total_matches": int(mask.sum()), + "total_matches": len(filtered_df), "returned_rows": len(filtered), "data": filtered.to_dict(orient='records') }, indent=2, default=str) @@ -2691,6 +3607,190 @@ def _sync_work(): return json.dumps({"error": str(e)}) return await asyncio.to_thread(_sync_work) + @kernel_function( + description=( + "Search one or more columns, or all columns when search_columns is omitted, for a value or phrase and return matching rows with row-context metadata. " + "Use this when the relevant column is unclear or when you need to search an entire worksheet or workbook for a topic before deciding which returned content is relevant." + ), + name="search_rows" + ) + @plugin_function_logger("TabularProcessingPlugin") + async def search_rows( + self, + user_id: Annotated[str, "The user ID (from Scope ID in Conversation Metadata)"], + conversation_id: Annotated[str, "The conversation ID (from Conversation Metadata)"], + filename: Annotated[str, "The filename of the tabular file"], + search_value: Annotated[str, "The text or value to search for"], + search_columns: Annotated[Optional[str], "Optional comma-separated columns to search. Omit to search all columns."] = None, + search_operator: Annotated[str, "Search operator: equals, contains, startswith, endswith"] = "contains", + return_columns: Annotated[Optional[str], "Optional comma-separated columns to include in each result row. 
Omit to return the full row."] = None, + query_expression: Annotated[Optional[str], "Optional pandas DataFrame.query() expression to apply before searching"] = None, + filter_column: Annotated[Optional[str], "Optional first column filter to narrow the search cohort"] = None, + filter_operator: Annotated[str, "Optional filter operator when filter_column is provided"] = "equals", + filter_value: Annotated[Optional[str], "Optional filter value when filter_column is provided"] = None, + additional_filter_column: Annotated[Optional[str], "Optional second column filter to narrow the search cohort"] = None, + additional_filter_operator: Annotated[str, "Optional filter operator when additional_filter_column is provided"] = "equals", + additional_filter_value: Annotated[Optional[str], "Optional filter value when additional_filter_column is provided"] = None, + normalize_match: Annotated[str, "Whether to normalize string/entity matching for text comparisons (true/false)"] = "false", + sheet_name: Annotated[Optional[str], "Optional worksheet name for Excel files. When omitted, the plugin may perform a cross-sheet search."] = None, + sheet_index: Annotated[Optional[str], "Optional zero-based worksheet index for Excel files. 
Ignored when sheet_name is provided."] = None, + source: Annotated[str, "Source: 'workspace', 'chat', 'group', or 'public'"] = "chat", + max_rows: Annotated[str, "Maximum matching rows to return"] = "100", + group_id: Annotated[Optional[str], "Group ID (for group workspace documents)"] = None, + public_workspace_id: Annotated[Optional[str], "Public workspace ID (for public workspace documents)"] = None, + ) -> Annotated[str, "JSON result containing matching rows, matched columns, and search metadata"]: + """Search rows across one or more columns while preserving row context.""" + def _sync_work(): + try: + normalize_match_flag = self._parse_boolean_argument(normalize_match, default=False) + parsed_search_columns = self._parse_optional_column_list_argument(search_columns) + parsed_return_columns = self._parse_optional_column_list_argument(return_columns) + container, blob_path = self._resolve_blob_location_with_fallback( + user_id, conversation_id, filename, source, + group_id=group_id, public_workspace_id=public_workspace_id + ) + + normalized_sheet = (sheet_name or '').strip() + normalized_sheet_idx = None if sheet_index is None else str(sheet_index).strip() + if not normalized_sheet and normalized_sheet_idx in (None, ''): + cross_sheet_result = self._search_rows_across_sheets( + container, + blob_path, + filename, + search_value=search_value, + search_columns=parsed_search_columns, + search_operator=search_operator, + return_columns=parsed_return_columns, + query_expression=query_expression, + filter_column=filter_column, + filter_operator=filter_operator, + filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, + normalize_match=normalize_match_flag, + max_rows=int(max_rows), + ) + if cross_sheet_result is not None: + return cross_sheet_result + + selected_sheet, workbook_metadata = self._resolve_sheet_selection( + container, + 
blob_path, + sheet_name=sheet_name, + sheet_index=sheet_index, + require_explicit_sheet=True, + ) + df = self._read_tabular_blob_to_dataframe( + container, + blob_path, + sheet_name=selected_sheet, + require_explicit_sheet=True, + ) + df = self._try_numeric_conversion(df) + + try: + filtered_df, applied_filters = self._apply_optional_dataframe_filters( + df, + query_expression=query_expression, + filter_column=filter_column, + filter_operator=filter_operator, + filter_value=filter_value, + additional_filter_column=additional_filter_column, + additional_filter_operator=additional_filter_operator, + additional_filter_value=additional_filter_value, + normalize_match=normalize_match_flag, + ) + except KeyError as missing_column_error: + missing_column = str(missing_column_error).strip("'") + return json.dumps( + self._build_missing_column_error_payload( + container, + blob_path, + filename, + workbook_metadata, + selected_sheet, + missing_column, + related_columns=[ + candidate_column + for candidate_column in ( + *(parsed_search_columns or []), + *(parsed_return_columns or []), + filter_column, + additional_filter_column, + ) + if candidate_column and candidate_column != missing_column + ] or None, + available_columns=list(df.columns), + ) + ) + except Exception as query_error: + return json.dumps({ + 'error': f"Query/filter error: {query_error}", + 'filename': filename, + 'selected_sheet': selected_sheet if workbook_metadata.get('is_workbook') else None, + }) + + try: + search_result = self._search_dataframe_rows( + filtered_df, + search_value=search_value, + search_columns=parsed_search_columns, + search_operator=search_operator, + return_columns=parsed_return_columns, + normalize_match=normalize_match_flag, + max_rows=int(max_rows), + ) + except KeyError as missing_column_error: + missing_column = str(missing_column_error).strip("'") + return json.dumps( + self._build_missing_column_error_payload( + container, + blob_path, + filename, + workbook_metadata, + 
selected_sheet, + missing_column, + related_columns=[ + candidate_column + for candidate_column in ( + *(parsed_search_columns or []), + *(parsed_return_columns or []), + filter_column, + additional_filter_column, + ) + if candidate_column and candidate_column != missing_column + ] or None, + available_columns=list(df.columns), + ) + ) + except ValueError as search_error: + return json.dumps({ + 'error': str(search_error), + 'filename': filename, + 'selected_sheet': selected_sheet if workbook_metadata.get('is_workbook') else None, + }) + + return json.dumps({ + 'filename': filename, + 'selected_sheet': selected_sheet if workbook_metadata.get('is_workbook') else None, + 'search_value': search_value, + 'search_operator': search_operator, + 'searched_columns': search_result['searched_columns'], + 'matched_columns': search_result['matched_columns'], + 'return_columns': search_result['return_columns'], + 'filter_applied': applied_filters, + 'normalize_match': normalize_match_flag, + 'total_matches': search_result['total_matches'], + 'returned_rows': search_result['returned_rows'], + 'data': search_result['data'], + }, indent=2, default=str) + except Exception as e: + log_event(f"[TabularProcessingPlugin] Error searching rows: {e}", level=logging.WARNING) + return json.dumps({"error": str(e)}) + + return await asyncio.to_thread(_sync_work) + @kernel_function( description=( "Execute a pandas query expression against a tabular file for advanced analysis. 
" @@ -2745,11 +3845,17 @@ def _sync_work(): ) df = self._try_numeric_conversion(df) - result_df = df.query(query_expression) + result_df, used_reviewer_style_fallback = self._apply_query_expression_with_fallback( + df, + query_expression=query_expression, + normalize_match=False, + ) limit = int(max_rows) return json.dumps({ "filename": filename, "selected_sheet": selected_sheet if workbook_metadata.get('is_workbook') else None, + "query_expression": query_expression, + "query_expression_fallback": used_reviewer_style_fallback, "total_matches": len(result_df), "returned_rows": min(len(result_df), limit), "data": result_df.head(limit).to_dict(orient='records') diff --git a/application/single_app/static/images/features/agent_action_grid_view.png b/application/single_app/static/images/features/agent_action_grid_view.png index 011fecaf..eaf3c01b 100644 Binary files a/application/single_app/static/images/features/agent_action_grid_view.png and b/application/single_app/static/images/features/agent_action_grid_view.png differ diff --git a/application/single_app/static/images/features/background_completion_notifications-01.png b/application/single_app/static/images/features/background_completion_notifications-01.png index f46bb7b3..7f8a3034 100644 Binary files a/application/single_app/static/images/features/background_completion_notifications-01.png and b/application/single_app/static/images/features/background_completion_notifications-01.png differ diff --git a/application/single_app/static/images/features/background_completion_notifications-02.png b/application/single_app/static/images/features/background_completion_notifications-02.png index aeac80d8..e02771b4 100644 Binary files a/application/single_app/static/images/features/background_completion_notifications-02.png and b/application/single_app/static/images/features/background_completion_notifications-02.png differ diff --git a/application/single_app/static/images/features/guided_tutorials_chat.png 
b/application/single_app/static/images/features/guided_tutorials_chat.png index a519385b..b25d89e9 100644 Binary files a/application/single_app/static/images/features/guided_tutorials_chat.png and b/application/single_app/static/images/features/guided_tutorials_chat.png differ diff --git a/application/single_app/static/images/features/guided_tutorials_workspace.png b/application/single_app/static/images/features/guided_tutorials_workspace.png index 246d018b..0fccf64c 100644 Binary files a/application/single_app/static/images/features/guided_tutorials_workspace.png and b/application/single_app/static/images/features/guided_tutorials_workspace.png differ diff --git a/application/single_app/static/images/features/sql_test_connection.png b/application/single_app/static/images/features/sql_test_connection.png index ad2cd4ac..8b18123a 100644 Binary files a/application/single_app/static/images/features/sql_test_connection.png and b/application/single_app/static/images/features/sql_test_connection.png differ diff --git a/application/single_app/static/images/features/tabular_analysis_enhanced_citations.png b/application/single_app/static/images/features/tabular_analysis_enhanced_citations.png index 7247eded..b6474ed9 100644 Binary files a/application/single_app/static/images/features/tabular_analysis_enhanced_citations.png and b/application/single_app/static/images/features/tabular_analysis_enhanced_citations.png differ diff --git a/application/single_app/static/images/features/thoughts_visibility.png b/application/single_app/static/images/features/thoughts_visibility.png index 0346bbed..cb987ea6 100644 Binary files a/application/single_app/static/images/features/thoughts_visibility.png and b/application/single_app/static/images/features/thoughts_visibility.png differ diff --git a/application/single_app/static/js/chat/chat-citations.js b/application/single_app/static/js/chat/chat-citations.js index 60099398..9d751ffd 100644 --- 
a/application/single_app/static/js/chat/chat-citations.js +++ b/application/single_app/static/js/chat/chat-citations.js @@ -10,6 +10,9 @@ import { showEnhancedCitationModal } from './chat-enhanced-citations.js'; // ------------------ const chatboxEl = document.getElementById("chatbox"); +const AGENT_CITATION_PREVIEW_ROWS = 3; +const AGENT_CITATION_EXPANDED_ROWS = 25; +let activeAgentCitationState = null; function escapeAttribute(value) { return String(value) @@ -310,8 +313,194 @@ export function showMetadataModal(metadataType, metadataContent, fileName) { modal.show(); } -export function showAgentCitationModal(toolName, toolArgs, toolResult) { - // Create or reuse the agent citation modal +function parseAgentCitationValue(value) { + if (value === null || value === undefined || value === "") { + return null; + } + + if (typeof value === "object") { + return value; + } + + if (typeof value !== "string") { + return value; + } + + const trimmedValue = value.trim(); + if (!trimmedValue || (trimmedValue[0] !== "{" && trimmedValue[0] !== "[")) { + return value; + } + + try { + return JSON.parse(trimmedValue); + } catch (error) { + return value; + } +} + +function prettyPrintAgentCitationValue(value) { + if (value === null || value === undefined || value === "") { + return "No result"; + } + + if (typeof value === "string") { + return value; + } + + try { + return JSON.stringify(value, null, 2); + } catch (error) { + return String(value); + } +} + +function cloneAgentCitationPayload(value) { + if (value === null || value === undefined) { + return value; + } + + try { + return JSON.parse(JSON.stringify(value)); + } catch (error) { + return value; + } +} + +function isTabularAgentCitationResult(resultPayload) { + return Boolean( + resultPayload + && typeof resultPayload === "object" + && !Array.isArray(resultPayload) + && Array.isArray(resultPayload.data) + && ( + Object.prototype.hasOwnProperty.call(resultPayload, "returned_rows") + || 
Object.prototype.hasOwnProperty.call(resultPayload, "total_matches") + || Object.prototype.hasOwnProperty.call(resultPayload, "filename") + || Object.prototype.hasOwnProperty.call(resultPayload, "selected_sheet") + ) + ); +} + +function getAgentCitationRowLimit(rowMode, totalRowCount) { + if (rowMode === "all") { + return totalRowCount; + } + + if (rowMode === "expanded25") { + return Math.min(totalRowCount, AGENT_CITATION_EXPANDED_ROWS); + } + + return Math.min(totalRowCount, AGENT_CITATION_PREVIEW_ROWS); +} + +function buildAgentCitationResultView(resultPayload, rowMode) { + if (!isTabularAgentCitationResult(resultPayload)) { + return { + resultText: prettyPrintAgentCitationValue(resultPayload), + summaryText: "", + controls: [], + }; + } + + const allRows = Array.isArray(resultPayload.data) ? resultPayload.data : []; + const totalRowCount = allRows.length; + const displayedRowCount = getAgentCitationRowLimit(rowMode, totalRowCount); + const displayedPayload = cloneAgentCitationPayload(resultPayload) || {}; + displayedPayload.data = allRows.slice(0, displayedRowCount); + displayedPayload.displayed_rows = displayedRowCount; + displayedPayload.data_rows_limited = displayedRowCount < totalRowCount; + + const summaryParts = []; + if (Object.prototype.hasOwnProperty.call(resultPayload, "total_matches")) { + summaryParts.push(`total_matches: ${resultPayload.total_matches}`); + } + if (Object.prototype.hasOwnProperty.call(resultPayload, "returned_rows")) { + summaryParts.push(`returned_rows: ${resultPayload.returned_rows}`); + } + summaryParts.push(`showing ${displayedRowCount} row${displayedRowCount === 1 ? 
"" : "s"}`); + + const controls = []; + if (totalRowCount > AGENT_CITATION_PREVIEW_ROWS && rowMode !== "preview") { + controls.push({ mode: "preview", label: "Show preview" }); + } + if ( + totalRowCount > AGENT_CITATION_EXPANDED_ROWS + && rowMode !== "expanded25" + ) { + controls.push({ mode: "expanded25", label: "Show 25 rows" }); + } + if ( + totalRowCount > AGENT_CITATION_PREVIEW_ROWS + && rowMode !== "all" + ) { + controls.push({ mode: "all", label: "Show all rows" }); + } + + return { + resultText: JSON.stringify(displayedPayload, null, 2), + summaryText: summaryParts.join(" • "), + controls, + }; +} + +function renderAgentCitationResult(toolResultEl, toolResultSummaryEl, toolResultActionsEl) { + if (!toolResultEl || !toolResultSummaryEl || !toolResultActionsEl || !activeAgentCitationState) { + return; + } + + const resultView = buildAgentCitationResultView( + activeAgentCitationState.parsedResult, + activeAgentCitationState.rowMode, + ); + + toolResultEl.textContent = resultView.resultText || "No result"; + toolResultSummaryEl.textContent = resultView.summaryText || ""; + toolResultSummaryEl.classList.toggle("d-none", !resultView.summaryText); + + toolResultActionsEl.innerHTML = ""; + toolResultActionsEl.classList.toggle("d-none", resultView.controls.length === 0); + resultView.controls.forEach((control) => { + const button = document.createElement("button"); + button.type = "button"; + button.className = "btn btn-sm btn-outline-secondary"; + button.textContent = control.label; + button.setAttribute("data-row-mode", control.mode); + button.addEventListener("click", () => { + activeAgentCitationState.rowMode = control.mode; + renderAgentCitationResult(toolResultEl, toolResultSummaryEl, toolResultActionsEl); + }); + toolResultActionsEl.appendChild(button); + }); +} + +async function fetchAgentCitationArtifact(conversationId, artifactId) { + if (!conversationId || !artifactId) { + return null; + } + + const response = await fetch( + 
`/api/conversation/${encodeURIComponent(conversationId)}/agent-citation/${encodeURIComponent(artifactId)}`, + { + method: "GET", + headers: { "Content-Type": "application/json" }, + } + ); + + let payload = null; + try { + payload = await response.json(); + } catch (error) { + payload = null; + } + + if (!response.ok) { + throw new Error(payload?.error || `Server responded with status ${response.status}`); + } + + return payload?.citation || null; +} + +export async function showAgentCitationModal(toolName, toolArgs, toolResult, options = {}) { let modalContainer = document.getElementById("agent-citation-modal"); if (!modalContainer) { modalContainer = document.createElement("div"); @@ -344,8 +533,12 @@ export function showAgentCitationModal(toolName, toolArgs, toolResult) {

             
             
-
Function Result:
-

+              
+
Function Result:
+
+
+
+

             
@@ -354,109 +547,62 @@ export function showAgentCitationModal(toolName, toolArgs, toolResult) { document.body.appendChild(modalContainer); } - // Update the content const toolNameEl = document.getElementById("agent-tool-name"); const toolArgsEl = document.getElementById("agent-tool-args"); const toolResultEl = document.getElementById("agent-tool-result"); + const toolResultSummaryEl = document.getElementById("agent-tool-result-summary"); + const toolResultActionsEl = document.getElementById("agent-tool-result-actions"); const toolSourceEl = document.getElementById("agent-tool-source"); const toolUrlEl = document.getElementById("agent-tool-url"); const toolUrlMetaEl = document.getElementById("agent-tool-url-meta"); - if (toolNameEl) { - toolNameEl.textContent = toolName || "Unknown"; - } - - let parsedArgs = null; - if (toolArgsEl) { - // Handle empty or no parameters more gracefully - let argsContent = ""; - + const artifactId = options.artifactId || ""; + const conversationId = options.conversationId + || window.chatConversations?.getCurrentConversationId?.() + || window.currentConversationId + || ""; + let citationPayload = { + tool_name: toolName, + function_arguments: toolArgs, + function_result: toolResult, + }; + + if (artifactId && conversationId) { + showLoadingIndicator(); try { - if (!toolArgs || toolArgs === "" || toolArgs === "{}") { - argsContent = "No parameters required"; - } else { - parsedArgs = JSON.parse(toolArgs); - // Check if it's an empty object - if (typeof parsedArgs === 'object' && Object.keys(parsedArgs).length === 0) { - argsContent = "No parameters required"; - } else { - argsContent = JSON.stringify(parsedArgs, null, 2); - } - } - } catch (e) { - // If it's not valid JSON, check if it's an object representation - if (toolArgs === "[object Object]" || !toolArgs || toolArgs.trim() === "") { - argsContent = "No parameters required"; - } else { - argsContent = toolArgs; + const hydratedCitation = await 
fetchAgentCitationArtifact(conversationId, artifactId); + if (hydratedCitation && typeof hydratedCitation === "object") { + citationPayload = hydratedCitation; } - } - - // Add truncation with expand/collapse if content is long - if (argsContent.length > 300 && argsContent !== "No parameters required") { - const truncatedContent = argsContent.substring(0, 300); - const remainingContent = argsContent.substring(300); - - toolArgsEl.innerHTML = ` -
- ${escapeHtml(truncatedContent)} - -
- `; - } else { - toolArgsEl.textContent = argsContent; + } catch (error) { + console.warn("Failed to hydrate agent citation artifact, using compact payload.", error); + } finally { + hideLoadingIndicator(); } } - - if (toolResultEl) { - // Handle result formatting and truncation with expand/collapse - let resultContent = ""; - let parsedResult = null; - - try { - if (!toolResult || toolResult === "" || toolResult === "{}") { - resultContent = "No result"; - } else if (toolResult === "[object Object]") { - resultContent = "No result data available"; - } else { - // Try to parse as JSON first - try { - parsedResult = JSON.parse(toolResult); - resultContent = JSON.stringify(parsedResult, null, 2); - } catch (parseError) { - // If not JSON, treat as string - resultContent = toolResult; - } - } - } catch (e) { - resultContent = toolResult || "No result"; - } + const parsedArgs = parseAgentCitationValue(citationPayload.function_arguments ?? toolArgs); + const parsedResult = parseAgentCitationValue(citationPayload.function_result ?? toolResult); + activeAgentCitationState = { + rowMode: "preview", + parsedArgs, + parsedResult, + }; + + if (toolNameEl) { + toolNameEl.textContent = citationPayload.tool_name || toolName || "Unknown"; + } + + if (toolArgsEl) { + toolArgsEl.textContent = parsedArgs === null + ? "No parameters required" + : prettyPrintAgentCitationValue(parsedArgs); + } + + if (toolResultEl && toolResultSummaryEl && toolResultActionsEl) { const citationDetails = extractAgentCitationDetails(parsedResult || parsedArgs); updateAgentCitationSource(toolSourceEl, toolUrlEl, toolUrlMetaEl, citationDetails); - - // Add truncation with expand/collapse if content is long - if (resultContent.length > 300) { - const truncatedContent = resultContent.substring(0, 300); - const remainingContent = resultContent.substring(300); - - toolResultEl.innerHTML = ` -
- ${escapeHtml(truncatedContent)} - -
- `; - } else { - toolResultEl.textContent = resultContent; - } + renderAgentCitationResult(toolResultEl, toolResultSummaryEl, toolResultActionsEl); } const modal = new bootstrap.Modal(modalContainer); @@ -681,6 +827,10 @@ if (chatboxEl) { const toolName = target.getAttribute("data-tool-name"); const toolArgs = target.getAttribute("data-tool-args"); const toolResult = target.getAttribute("data-tool-result"); + const artifactId = target.getAttribute("data-artifact-id"); + const conversationId = target.getAttribute("data-conversation-id") + || window.chatConversations?.getCurrentConversationId?.() + || window.currentConversationId; if (!toolName) { console.warn("Agent citation link clicked but data-tool-name is missing."); @@ -688,7 +838,10 @@ if (chatboxEl) { return; } - showAgentCitationModal(toolName, toolArgs, toolResult); + void showAgentCitationModal(toolName, toolArgs, toolResult, { + artifactId, + conversationId, + }); } else if (target && target.matches("a.file-link")) { // Keep existing file link logic event.preventDefault(); @@ -719,42 +872,4 @@ function escapeHtml(text) { div.textContent = text; return div.innerHTML; } - -// Global function to toggle result expansion (called from inline onclick) -window.toggleResultExpansion = function(button) { - const resultContent = button.closest('.result-content'); - const remaining = resultContent.querySelector('.result-remaining'); - const icon = button.querySelector('i'); - - if (remaining.style.display === 'none') { - // Expand - remaining.style.display = 'inline'; - icon.className = 'bi bi-chevron-up'; - button.title = 'Show less'; - } else { - // Collapse - remaining.style.display = 'none'; - icon.className = 'bi bi-chevron-down'; - button.title = 'Show more'; - } -}; - -// Global function to toggle arguments expansion (called from inline onclick) -window.toggleArgsExpansion = function(button) { - const argsContent = button.closest('.args-content'); - const remaining = 
argsContent.querySelector('.args-remaining'); - const icon = button.querySelector('i'); - - if (remaining.style.display === 'none') { - // Expand - remaining.style.display = 'inline'; - icon.className = 'bi bi-chevron-up'; - button.title = 'Show less'; - } else { - // Collapse - remaining.style.display = 'none'; - icon.className = 'bi bi-chevron-down'; - button.title = 'Show more'; - } -}; // --------------------------------------- \ No newline at end of file diff --git a/application/single_app/static/js/chat/chat-documents.js b/application/single_app/static/js/chat/chat-documents.js index 82b0f898..dde90dc5 100644 --- a/application/single_app/static/js/chat/chat-documents.js +++ b/application/single_app/static/js/chat/chat-documents.js @@ -34,6 +34,7 @@ const scopeSearchInput = document.getElementById("scope-search-input"); export let personalDocs = []; export let groupDocs = []; export let publicDocs = []; +const citationMetadataCache = new Map(); // Items removed from the DOM by tag filtering (stored so they can be re-added) // Each entry: { element, nextSibling } @@ -856,9 +857,46 @@ export function getDocumentMetadata(docId) { if (publicMatch) { return publicMatch; } + const cachedMatch = citationMetadataCache.get(docId); + if (cachedMatch) { + return cachedMatch; + } return null; // Not found in any list } +export async function fetchDocumentMetadata(docId) { + if (!docId) { + return null; + } + + const existingMetadata = getDocumentMetadata(docId); + if (existingMetadata) { + return existingMetadata; + } + + try { + const response = await fetch(`/api/enhanced_citations/document_metadata?doc_id=${encodeURIComponent(docId)}`, { + credentials: 'same-origin', + }); + + if (!response.ok) { + return null; + } + + const metadata = await response.json(); + if (metadata && metadata.id) { + citationMetadataCache.set(metadata.id, metadata); + } + if (metadata && metadata.document_id) { + citationMetadataCache.set(metadata.document_id, metadata); + } + return metadata; 
+ } catch (error) { + console.warn('Error fetching citation document metadata:', error); + return null; + } +} + /* --------------------------------------------------------------------------- Loading Documents --------------------------------------------------------------------------- */ @@ -1034,6 +1072,7 @@ export async function loadTagsForScope() { // Clear existing options in both hidden select and custom dropdown chatTagsFilter.innerHTML = ''; if (tagsDropdownItems) tagsDropdownItems.innerHTML = ''; + resetTagSelectionState(); try { const scopes = getEffectiveScopes(); @@ -1244,6 +1283,24 @@ function hideTagsDropdown() { } } +function resetTagSelectionState() { + if (chatTagsFilter) { + Array.from(chatTagsFilter.options).forEach(option => { + option.selected = false; + }); + } + + if (tagsDropdownItems) { + tagsDropdownItems.querySelectorAll('.tag-checkbox').forEach(checkbox => { + checkbox.checked = false; + }); + } + + tagsSearchController?.resetFilter(); + syncTagsDropdownButtonText(); + filterDocumentsBySelectedTags(); +} + /* --------------------------------------------------------------------------- Sync Tags Dropdown Button Text with Selection State --------------------------------------------------------------------------- */ diff --git a/application/single_app/static/js/chat/chat-enhanced-citations.js b/application/single_app/static/js/chat/chat-enhanced-citations.js index 93779da9..561a7831 100644 --- a/application/single_app/static/js/chat/chat-enhanced-citations.js +++ b/application/single_app/static/js/chat/chat-enhanced-citations.js @@ -3,7 +3,7 @@ import { showToast } from "./chat-toast.js"; import { showLoadingIndicator, hideLoadingIndicator } from "./chat-loading-indicator.js"; -import { getDocumentMetadata } from './chat-documents.js'; +import { getDocumentMetadata, fetchDocumentMetadata } from './chat-documents.js'; /** * Determine file type from filename extension @@ -36,9 +36,14 @@ export function getFileType(fileName) { * @param {string} 
citationId - Citation ID for fallback * @param {string|null} initialSheetName - Workbook sheet to open initially for tabular files */ -export function showEnhancedCitationModal(docId, pageNumberOrTimestamp, citationId, initialSheetName = null) { - // Get document metadata to determine file type - const docMetadata = getDocumentMetadata(docId); +export async function showEnhancedCitationModal(docId, pageNumberOrTimestamp, citationId, initialSheetName = null) { + // Get document metadata to determine file type. Historical cited revisions + // are not in the current workspace list, so fetch on demand when needed. + let docMetadata = getDocumentMetadata(docId); + if (!docMetadata || !docMetadata.file_name) { + docMetadata = await fetchDocumentMetadata(docId); + } + if (!docMetadata || !docMetadata.file_name) { console.warn('Document metadata not found, falling back to text citation'); // Import fetchCitedText dynamically to avoid circular imports diff --git a/application/single_app/static/js/chat/chat-messages.js b/application/single_app/static/js/chat/chat-messages.js index 0c28e682..d350eb3a 100644 --- a/application/single_app/static/js/chat/chat-messages.js +++ b/application/single_app/static/js/chat/chat-messages.js @@ -448,6 +448,8 @@ function createCitationsHtml( data-tool-name="${escapeHtml(cite.tool_name || '')}" data-tool-args="${escapeHtml(toolArgs)}" data-tool-result="${escapeHtml(toolResult)}" + data-artifact-id="${escapeHtml(cite.artifact_id || '')}" + data-conversation-id="${escapeHtml(window.currentConversationId || '')}" title="Agent tool: ${escapeHtml(displayText)} - Click to view details"> ${escapeHtml(displayText)} `; @@ -2448,6 +2450,46 @@ function toggleMessageMetadata(messageDiv, messageId) { * Load message metadata into the drawer for AI/image/file messages */ function loadMessageMetadataForDisplay(messageId, container) { + function renderHistoryContextRefRow(label, refs) { + if (!Array.isArray(refs) || refs.length === 0) { + return `
${label}: none
`; + } + + return ` +
+
${label}:
+
${escapeHtml(refs.join(', '))}
+
+ `; + } + + function renderHistoryContextSection(historyContext) { + if (!historyContext || typeof historyContext !== 'object') { + return ''; + } + + let sectionHtml = '
'; + sectionHtml += '
History Context
'; + sectionHtml += '
'; + sectionHtml += `
Path: ${escapeHtml(String(historyContext.path || 'unknown'))}
`; + sectionHtml += `
Stored Messages: ${Number(historyContext.stored_total_messages || 0)}
`; + sectionHtml += `
History Limit: ${Number(historyContext.history_limit || 0)}
`; + sectionHtml += `
Older Messages: ${Number(historyContext.older_message_count || 0)}
`; + sectionHtml += `
Recent Selected: ${Number(historyContext.recent_message_count || 0)}
`; + sectionHtml += `
Final API Messages: ${Number(historyContext.final_api_message_count || 0)}
`; + sectionHtml += `
Summary Requested: ${historyContext.summary_requested ? 'Yes' : 'No'}
`; + sectionHtml += `
Summary Used: ${historyContext.summary_used ? 'Yes' : 'No'}
`; + sectionHtml += `
Default System Prompt: ${historyContext.default_system_prompt_inserted ? 'Inserted' : 'Not inserted'}
`; + sectionHtml += renderHistoryContextRefRow('Recent Refs', historyContext.selected_recent_message_refs); + sectionHtml += renderHistoryContextRefRow('Summarized Refs', historyContext.summarized_message_refs); + sectionHtml += renderHistoryContextRefRow('Skipped Inactive', historyContext.skipped_inactive_message_refs); + sectionHtml += renderHistoryContextRefRow('Skipped Masked', historyContext.skipped_masked_message_refs); + sectionHtml += renderHistoryContextRefRow('Final API Refs', historyContext.final_api_source_refs); + sectionHtml += '
'; + + return sectionHtml; + } + fetch(`/api/message/${messageId}/metadata`) .then(response => { if (!response.ok) { @@ -2471,6 +2513,7 @@ function loadMessageMetadataForDisplay(messageId, container) { active_thread: metadata.active_thread, thread_attempt: metadata.thread_attempt }; + const historyContext = metadata.metadata?.history_context || null; if (threadInfo.thread_id) { html += '
'; @@ -2533,6 +2576,10 @@ function loadMessageMetadataForDisplay(messageId, container) { html += '
'; } + + if (metadata.role === 'assistant' && historyContext) { + html += renderHistoryContextSection(historyContext); + } html += ''; container.innerHTML = html; diff --git a/application/single_app/static/js/chat/chat-thoughts.js b/application/single_app/static/js/chat/chat-thoughts.js index e5ada65f..a405d918 100644 --- a/application/single_app/static/js/chat/chat-thoughts.js +++ b/application/single_app/static/js/chat/chat-thoughts.js @@ -14,6 +14,7 @@ let activeStreamingServerMessageId = null; // --------------------------------------------------------------------------- function getThoughtIcon(stepType) { const iconMap = { + 'history_context': 'bi-diagram-3', 'search': 'bi-search', 'tabular_analysis': 'bi-table', 'web_search': 'bi-globe', diff --git a/application/single_app/static/js/public/public_workspace.js b/application/single_app/static/js/public/public_workspace.js index 3b8f54a0..c24dfe2c 100644 --- a/application/single_app/static/js/public/public_workspace.js +++ b/application/single_app/static/js/public/public_workspace.js @@ -48,6 +48,178 @@ const publicPromptModal = new bootstrap.Modal(document.getElementById('publicPro const publicDocMetadataModal = new bootstrap.Modal(document.getElementById('publicDocMetadataModal')); const publicTagManagementModal = new bootstrap.Modal(document.getElementById('publicTagManagementModal')); const publicTagSelectionModal = new bootstrap.Modal(document.getElementById('publicTagSelectionModal')); +const publicDocumentDeleteModalElement = document.getElementById('publicDocumentDeleteModal'); +const publicDocumentDeleteModal = publicDocumentDeleteModalElement ? 
new bootstrap.Modal(publicDocumentDeleteModalElement) : null; +const publicDocumentDeleteModalTitle = document.getElementById('publicDocumentDeleteModalLabel'); +const publicDocumentDeleteModalBody = document.getElementById('publicDocumentDeleteModalBody'); +const publicDeleteCurrentBtn = document.getElementById('publicDeleteCurrentBtn'); +const publicDeleteAllBtn = document.getElementById('publicDeleteAllBtn'); + +function getPublicDeleteModalContent(documentCount) { + if (documentCount === 1) { + return { + title: 'Delete Public Document', + body: ` +

Choose how to delete this public document revision.

+

Delete Current Version removes the visible revision and keeps older revisions for future comparison.

+

Delete All Versions permanently removes every stored revision for this document.

+ `, + }; + } + + return { + title: 'Delete Selected Public Documents', + body: ` +

Choose how to delete ${documentCount} selected current public document revision(s).

+

Delete Current Version removes only the visible revision for each selected document and keeps older revisions.

+

Delete All Versions permanently removes every stored revision for each selected document.

+ `, + }; +} + +function showPublicDocumentDeleteFeedback(message, variant = 'danger') { + if (typeof window.showToast === 'function') { + window.showToast(message, variant); + return; + } + + let container = document.getElementById('publicDocumentDeleteFeedbackContainer'); + if (!container) { + container = document.createElement('div'); + container.id = 'publicDocumentDeleteFeedbackContainer'; + container.className = 'toast-container position-fixed top-0 end-0 p-3'; + document.body.appendChild(container); + } + + if (window.bootstrap && typeof window.bootstrap.Toast === 'function') { + const toastElement = document.createElement('div'); + toastElement.className = `toast align-items-center text-white bg-${variant} border-0`; + toastElement.setAttribute('role', 'alert'); + toastElement.setAttribute('aria-live', 'assertive'); + toastElement.setAttribute('aria-atomic', 'true'); + + const wrapper = document.createElement('div'); + wrapper.className = 'd-flex'; + + const body = document.createElement('div'); + body.className = 'toast-body'; + body.textContent = message; + + const closeButton = document.createElement('button'); + closeButton.type = 'button'; + closeButton.className = 'btn-close btn-close-white me-2 m-auto'; + closeButton.setAttribute('data-bs-dismiss', 'toast'); + closeButton.setAttribute('aria-label', 'Close'); + + wrapper.appendChild(body); + wrapper.appendChild(closeButton); + toastElement.appendChild(wrapper); + container.appendChild(toastElement); + + const toast = new window.bootstrap.Toast(toastElement); + toast.show(); + toastElement.addEventListener('hidden.bs.toast', () => { + toastElement.remove(); + }); + return; + } + + const alertElement = document.createElement('div'); + alertElement.className = `alert alert-${variant} alert-dismissible fade show mb-2`; + alertElement.setAttribute('role', 'alert'); + + const body = document.createElement('span'); + body.textContent = message; + + const closeButton = document.createElement('button'); + 
closeButton.type = 'button'; + closeButton.className = 'btn-close'; + closeButton.setAttribute('data-bs-dismiss', 'alert'); + closeButton.setAttribute('aria-label', 'Close'); + + alertElement.appendChild(body); + alertElement.appendChild(closeButton); + container.appendChild(alertElement); +} + +function isPublicDocumentDeleteModalReady() { + return Boolean( + publicDocumentDeleteModal && + publicDocumentDeleteModalElement && + publicDocumentDeleteModalElement.isConnected && + publicDocumentDeleteModalBody && + publicDocumentDeleteModalBody.isConnected && + publicDeleteCurrentBtn && + publicDeleteCurrentBtn.isConnected && + publicDeleteAllBtn && + publicDeleteAllBtn.isConnected + ); +} + +function promptPublicDeleteMode(documentCount = 1) { + if (!isPublicDocumentDeleteModalReady()) { + showPublicDocumentDeleteFeedback('Delete confirmation dialog is unavailable. Refresh the page and try again.'); + return Promise.resolve(null); + } + + const modalContent = getPublicDeleteModalContent(documentCount); + if (publicDocumentDeleteModalTitle) { + publicDocumentDeleteModalTitle.textContent = modalContent.title; + } + publicDocumentDeleteModalBody.innerHTML = modalContent.body; + + return new Promise((resolve) => { + let settled = false; + + const cleanup = () => { + publicDocumentDeleteModalElement.removeEventListener('hidden.bs.modal', handleHidden); + publicDeleteCurrentBtn.removeEventListener('click', handleCurrentOnly); + publicDeleteAllBtn.removeEventListener('click', handleAllVersions); + }; + + const finalize = (value) => { + if (settled) { + return; + } + settled = true; + cleanup(); + resolve(value); + }; + + const handleHidden = () => finalize(null); + const handleCurrentOnly = () => { + publicDocumentDeleteModal.hide(); + finalize('current_only'); + }; + const handleAllVersions = () => { + publicDocumentDeleteModal.hide(); + finalize('all_versions'); + }; + + publicDocumentDeleteModalElement.addEventListener('hidden.bs.modal', handleHidden); + 
publicDeleteCurrentBtn.addEventListener('click', handleCurrentOnly); + publicDeleteAllBtn.addEventListener('click', handleAllVersions); + publicDocumentDeleteModal.show(); + }); +} + +async function requestPublicDocumentDeletion(documentId, deleteMode) { + const query = new URLSearchParams({ delete_mode: deleteMode }); + const response = await fetch(`/api/public_documents/${documentId}?${query.toString()}`, { method: 'DELETE' }); + + let responseData = {}; + try { + responseData = await response.json(); + } catch (error) { + responseData = {}; + } + + if (!response.ok) { + throw responseData.error ? responseData : { error: `Server responded with status ${response.status}` }; + } + + return responseData; +} // Editors let publicSimplemde = null; @@ -794,7 +966,32 @@ async function onPublicUploadClick() { xhr.send(formData); }); } -window.deletePublicDocument=async function(id, event){ if(!confirm('Delete?')) return; try{ await fetch(`/api/public_documents/${id}`,{method:'DELETE'}); fetchPublicDocs(); }catch(e){ alert(`Error deleting: ${e.error||e.message}`);} }; +window.deletePublicDocument = async function(id, event) { + const deleteMode = await promptPublicDeleteMode(1); + if (!deleteMode) { + return; + } + + const deleteTrigger = event ? event.target.closest('a, button') : null; + const originalDeleteTriggerHtml = deleteTrigger ? 
deleteTrigger.innerHTML : null; + if (deleteTrigger) { + deleteTrigger.classList.add('disabled'); + deleteTrigger.setAttribute('aria-disabled', 'true'); + deleteTrigger.innerHTML = ''; + } + + try { + await requestPublicDocumentDeletion(id, deleteMode); + fetchPublicDocs(); + } catch (e) { + showPublicWorkspaceToast(`Error deleting: ${e.error || e.message}`, 'danger'); + if (deleteTrigger && document.body.contains(deleteTrigger)) { + deleteTrigger.classList.remove('disabled'); + deleteTrigger.removeAttribute('aria-disabled'); + deleteTrigger.innerHTML = originalDeleteTriggerHtml; + } + } +}; window.searchPublicDocumentInChat = function(docId) { window.location.href = `/chats?search_documents=true&doc_scope=public&document_id=${docId}&workspace_id=${activePublicId}`; @@ -854,34 +1051,45 @@ function clearPublicSelection() { function deletePublicSelectedDocuments() { if (publicSelectedDocuments.size === 0) return; - if (!confirm(`Are you sure you want to delete ${publicSelectedDocuments.size} selected document(s)? This action cannot be undone.`)) return; - const deleteBtn = document.getElementById('public-delete-selected-btn'); - if (deleteBtn) { - deleteBtn.disabled = true; - deleteBtn.innerHTML = 'Deleting...'; - } + promptPublicDeleteMode(publicSelectedDocuments.size).then((deleteMode) => { + if (!deleteMode) { + return; + } - const deletePromises = Array.from(publicSelectedDocuments).map(docId => - fetch(`/api/public_documents/${docId}`, { method: 'DELETE' }) - .then(r => r.ok ? r.json() : Promise.reject(r)) - ); + const deleteBtn = document.getElementById('public-delete-selected-btn'); + if (deleteBtn) { + deleteBtn.disabled = true; + deleteBtn.innerHTML = 'Deleting...'; + } - Promise.allSettled(deletePromises) - .then(results => { - const successful = results.filter(r => r.status === 'fulfilled').length; - const failed = results.filter(r => r.status === 'rejected').length; - if (failed > 0) alert(`Deleted ${successful} document(s). 
${failed} failed to delete.`); - publicSelectedDocuments.clear(); - updatePublicBulkActionButtons(); - fetchPublicDocs(); - }) - .finally(() => { - if (deleteBtn) { - deleteBtn.disabled = false; - deleteBtn.innerHTML = 'Delete Selected'; - } - }); + const deletePromises = Array.from(publicSelectedDocuments).map((docId) => requestPublicDocumentDeletion(docId, deleteMode)); + + Promise.allSettled(deletePromises) + .then((results) => { + const successful = results.filter((result) => result.status === 'fulfilled').length; + const failed = results.filter((result) => result.status === 'rejected').length; + if (failed > 0) { + const toastType = successful === 0 ? 'danger' : 'warning'; + showPublicWorkspaceToast(`Deleted ${successful} document(s). ${failed} failed to delete.`, toastType); + } + + if (publicSelectionMode) { + togglePublicSelectionMode(); + } else { + publicSelectedDocuments.clear(); + updatePublicBulkActionButtons(); + } + + fetchPublicDocs(); + }) + .finally(() => { + if (deleteBtn) { + deleteBtn.disabled = false; + deleteBtn.innerHTML = 'Delete Selected'; + } + }); + }); } function chatWithPublicSelected() { diff --git a/application/single_app/static/js/public/public_workspace_utility.js b/application/single_app/static/js/public/public_workspace_utility.js index d55959c3..85316737 100644 --- a/application/single_app/static/js/public/public_workspace_utility.js +++ b/application/single_app/static/js/public/public_workspace_utility.js @@ -13,6 +13,53 @@ function escapeHtml(unsafe) { return div.innerHTML; } +/** + * Shows a Bootstrap toast for public workspace actions. 
+ * @param {string} message - The message to display + * @param {string} [type='info'] - Bootstrap contextual color + * @param {number} [duration=5000] - Toast delay in milliseconds + */ +function showPublicWorkspaceToast(message, type = 'info', duration = 5000) { + const safeMessage = escapeHtml(message || ''); + let toastContainer = document.getElementById('toast-container'); + + if (!toastContainer) { + toastContainer = document.createElement('div'); + toastContainer.id = 'toast-container'; + toastContainer.className = 'toast-container position-fixed bottom-0 end-0 p-3'; + toastContainer.style.zIndex = '1100'; + document.body.appendChild(toastContainer); + } + + const toastId = `public-workspace-toast-${Date.now()}-${Math.floor(Math.random() * 1000)}`; + toastContainer.insertAdjacentHTML('beforeend', ` + + `); + + const toastElement = document.getElementById(toastId); + if (!toastElement) { + return; + } + + if (!window.bootstrap || !window.bootstrap.Toast) { + toastElement.classList.add('show'); + return; + } + + const toast = new bootstrap.Toast(toastElement, { delay: duration }); + toast.show(); + toastElement.addEventListener('hidden.bs.toast', () => { + toastElement.remove(); + }); +} + +window.showPublicWorkspaceToast = showPublicWorkspaceToast; + /** * Updates the workspace status alert display based on workspace status * @param {Object} options - Configuration options diff --git a/application/single_app/static/js/workspace/workspace-documents.js b/application/single_app/static/js/workspace/workspace-documents.js index 481fe310..eefd0237 100644 --- a/application/single_app/static/js/workspace/workspace-documents.js +++ b/application/single_app/static/js/workspace/workspace-documents.js @@ -29,6 +29,12 @@ const docMetadataForm = document.getElementById("doc-metadata-form"); const docsSharedOnlyFilter = document.getElementById("docs-shared-only-filter"); const deleteSelectedBtn = document.getElementById("delete-selected-btn"); const clearSelectionBtn = 
document.getElementById("clear-selection-btn"); +const documentDeleteModalElement = document.getElementById("documentDeleteModal"); +const documentDeleteModal = documentDeleteModalElement ? new bootstrap.Modal(documentDeleteModalElement) : null; +const documentDeleteModalTitle = document.getElementById("documentDeleteModalLabel"); +const documentDeleteModalBody = document.getElementById("documentDeleteModalBody"); +const documentDeleteCurrentBtn = document.getElementById("documentDeleteCurrentBtn"); +const documentDeleteAllBtn = document.getElementById("documentDeleteAllBtn"); // Selection mode variables let selectionModeActive = false; @@ -84,6 +90,172 @@ function isColorLight(hexColor) { return luminance > 0.5; } +function getDocumentDeleteModalContent(documentCount) { + if (documentCount === 1) { + return { + title: "Delete Document", + body: ` +

Choose how to delete this document revision.

+

Delete Current Version removes the visible revision and keeps older revisions for later comparison.

+

Delete All Versions permanently removes every stored revision for this document.

+ `, + }; + } + + return { + title: "Delete Selected Documents", + body: ` +

Choose how to delete ${documentCount} selected current document revision(s).

+

Delete Current Version removes only the visible revision for each selected document and keeps older revisions.

+

Delete All Versions permanently removes every stored revision for each selected document.

+ `, + }; +} + +function showDocumentDeleteFeedback(message, variant = "danger") { + if (typeof window.showToast === "function") { + window.showToast(message, variant); + return; + } + + let container = document.getElementById("documentDeleteFeedbackContainer"); + if (!container) { + container = document.createElement("div"); + container.id = "documentDeleteFeedbackContainer"; + container.className = "toast-container position-fixed top-0 end-0 p-3"; + document.body.appendChild(container); + } + + if (window.bootstrap && typeof window.bootstrap.Toast === "function") { + const toastElement = document.createElement("div"); + toastElement.className = `toast align-items-center text-white bg-${variant} border-0`; + toastElement.setAttribute("role", "alert"); + toastElement.setAttribute("aria-live", "assertive"); + toastElement.setAttribute("aria-atomic", "true"); + + const wrapper = document.createElement("div"); + wrapper.className = "d-flex"; + + const body = document.createElement("div"); + body.className = "toast-body"; + body.textContent = message; + + const closeButton = document.createElement("button"); + closeButton.type = "button"; + closeButton.className = "btn-close btn-close-white me-2 m-auto"; + closeButton.setAttribute("data-bs-dismiss", "toast"); + closeButton.setAttribute("aria-label", "Close"); + + wrapper.appendChild(body); + wrapper.appendChild(closeButton); + toastElement.appendChild(wrapper); + container.appendChild(toastElement); + + const toast = new window.bootstrap.Toast(toastElement); + toast.show(); + toastElement.addEventListener("hidden.bs.toast", () => { + toastElement.remove(); + }); + return; + } + + const alertElement = document.createElement("div"); + alertElement.className = `alert alert-${variant} alert-dismissible fade show mb-2`; + alertElement.setAttribute("role", "alert"); + + const body = document.createElement("span"); + body.textContent = message; + + const closeButton = document.createElement("button"); + closeButton.type = 
"button"; + closeButton.className = "btn-close"; + closeButton.setAttribute("data-bs-dismiss", "alert"); + closeButton.setAttribute("aria-label", "Close"); + + alertElement.appendChild(body); + alertElement.appendChild(closeButton); + container.appendChild(alertElement); +} + +function isDocumentDeleteModalReady() { + return Boolean( + documentDeleteModal && + documentDeleteModalElement && + documentDeleteModalElement.isConnected && + documentDeleteModalBody && + documentDeleteModalBody.isConnected && + documentDeleteCurrentBtn && + documentDeleteCurrentBtn.isConnected && + documentDeleteAllBtn && + documentDeleteAllBtn.isConnected + ); +} + +function promptDocumentDeleteMode(documentCount = 1) { + if (!isDocumentDeleteModalReady()) { + showDocumentDeleteFeedback("Delete confirmation dialog is unavailable. Refresh the page and try again."); + return Promise.resolve(null); + } + + const modalContent = getDocumentDeleteModalContent(documentCount); + if (documentDeleteModalTitle) { + documentDeleteModalTitle.textContent = modalContent.title; + } + documentDeleteModalBody.innerHTML = modalContent.body; + + return new Promise((resolve) => { + let settled = false; + + const cleanup = () => { + documentDeleteModalElement.removeEventListener("hidden.bs.modal", handleHidden); + documentDeleteCurrentBtn.removeEventListener("click", handleCurrentOnly); + documentDeleteAllBtn.removeEventListener("click", handleAllVersions); + }; + + const finalize = (value) => { + if (settled) { + return; + } + settled = true; + cleanup(); + resolve(value); + }; + + const handleHidden = () => finalize(null); + const handleCurrentOnly = () => { + documentDeleteModal.hide(); + finalize("current_only"); + }; + const handleAllVersions = () => { + documentDeleteModal.hide(); + finalize("all_versions"); + }; + + documentDeleteModalElement.addEventListener("hidden.bs.modal", handleHidden); + documentDeleteCurrentBtn.addEventListener("click", handleCurrentOnly); + 
documentDeleteAllBtn.addEventListener("click", handleAllVersions); + documentDeleteModal.show(); + }); +} + +async function requestDocumentDeletion(documentId, deleteMode) { + const query = new URLSearchParams({ delete_mode: deleteMode }); + const response = await fetch(`/api/documents/${documentId}?${query.toString()}`, { method: "DELETE" }); + + let responseData = {}; + try { + responseData = await response.json(); + } catch (error) { + responseData = {}; + } + + if (!response.ok) { + throw responseData.error ? responseData : { error: `Server responded with status ${response.status}` }; + } + + return responseData; +} + // ------------- Event Listeners ------------- // Page Size @@ -1252,59 +1424,37 @@ window.onExtractMetadata = function (docId, event) { }; -window.deleteDocument = function(documentId, event) { - if (!confirm("Are you sure you want to delete this document? This action cannot be undone.")) return; +window.deleteDocument = async function(documentId, event) { + const deleteMode = await promptDocumentDeleteMode(1); + if (!deleteMode) { + return; + } - const deleteBtn = event ? event.target.closest('button') : null; - if (deleteBtn) { - deleteBtn.disabled = true; - deleteBtn.innerHTML = ``; + const deleteTrigger = event ? event.target.closest('a, button') : null; + const originalDeleteTriggerHtml = deleteTrigger ? deleteTrigger.innerHTML : null; + if (deleteTrigger) { + deleteTrigger.classList.add('disabled'); + deleteTrigger.setAttribute('aria-disabled', 'true'); + deleteTrigger.innerHTML = ``; } - // Stop polling if active for this document if (activePolls.has(documentId)) { - // Find the interval ID associated with this poll to clear it (more robust approach needed if storing interval IDs) - // For now, just remove from the active set; the poll will eventually fail or stop when elements disappear activePolls.delete(documentId); - // Ideally, you'd store intervalId with the docId in a map to clear it here. 
} - - fetch(`/api/documents/${documentId}`, { method: "DELETE" }) - .then(response => { - if (!response.ok) { - return response.json().then(data => Promise.reject(data)).catch(() => Promise.reject({ error: `Server responded with status ${response.status}` })); - } - return response.json(); - }) - .then(data => { - console.log("Document deleted successfully:", data); - const docRow = document.getElementById(`doc-row-${documentId}`); - const detailsRow = document.getElementById(`details-row-${documentId}`); - const statusRow = document.getElementById(`status-row-${documentId}`); - if (docRow) docRow.remove(); - if (detailsRow) detailsRow.remove(); - if (statusRow) statusRow.remove(); - - // Refresh if the table body becomes empty OR to update pagination total count - if (documentsTableBody && documentsTableBody.childElementCount === 0) { - fetchUserDocuments(); // Refresh to show 'No documents' message and correct pagination - } else { - // Maybe just decrement total count locally and re-render pagination? 
- // For simplicity, a full refresh might be acceptable unless dealing with huge lists/slow API - fetchUserDocuments(); // Refresh to update pagination potentially - } - - }) - .catch(error => { - console.error("Error deleting document:", error); - alert("Error deleting document: " + (error.error || error.message || "Unknown error")); - // Re-enable button only if it still exists - if (deleteBtn && document.body.contains(deleteBtn)) { - deleteBtn.disabled = false; - deleteBtn.innerHTML = ''; - } - }); + try { + const responseData = await requestDocumentDeletion(documentId, deleteMode); + console.log("Document deleted successfully:", responseData); + fetchUserDocuments(); + } catch (error) { + console.error("Error deleting document:", error); + alert("Error deleting document: " + (error.error || error.message || "Unknown error")); + if (deleteTrigger && document.body.contains(deleteTrigger)) { + deleteTrigger.classList.remove('disabled'); + deleteTrigger.removeAttribute('aria-disabled'); + deleteTrigger.innerHTML = originalDeleteTriggerHtml; + } + } } window.removeSelfFromDocument = function(documentId, event) { @@ -1468,64 +1618,43 @@ function updateBulkActionButtons() { } // Delete selected documents -window.deleteSelectedDocuments = function() { +window.deleteSelectedDocuments = async function() { if (selectedDocuments.size === 0) return; - - if (!confirm(`Are you sure you want to delete ${selectedDocuments.size} document(s)? 
This action cannot be undone.`)) { + + const deleteMode = await promptDocumentDeleteMode(selectedDocuments.size); + if (!deleteMode) { return; } - + const documentIds = Array.from(selectedDocuments); - let completed = 0; - let failed = 0; - - // Process each document deletion sequentially - documentIds.forEach(docId => { - fetch(`/api/documents/${docId}`, { method: "DELETE" }) - .then(response => { - if (response.ok) { - completed++; - const docRow = document.getElementById(`doc-row-${docId}`); - const detailsRow = document.getElementById(`details-row-${docId}`); - const statusRow = document.getElementById(`status-row-${docId}`); - if (docRow) docRow.remove(); - if (detailsRow) detailsRow.remove(); - if (statusRow) statusRow.remove(); - } else { - failed++; - } - - // Update status when all operations complete - if (completed + failed === documentIds.length) { - if (failed > 0) { - alert(`Deleted ${completed} document(s), but failed to delete ${failed} document(s).`); - } else { - alert(`Successfully deleted ${completed} document(s).`); - } - - // Refresh the documents list - fetchUserDocuments(); - - // Exit selection mode - window.toggleSelectionMode(); - } - }) - .catch(error => { - failed++; - console.error("Error deleting document:", error); - - // Update status when all operations complete - if (completed + failed === documentIds.length) { - alert(`Deleted ${completed} document(s), but failed to delete ${failed} document(s).`); - - // Refresh the documents list - fetchUserDocuments(); - - // Exit selection mode - window.toggleSelectionMode(); - } - }); - }); + if (deleteSelectedBtn) { + deleteSelectedBtn.disabled = true; + deleteSelectedBtn.innerHTML = `Deleting...`; + } + + documentIds.forEach((docId) => activePolls.delete(docId)); + + const results = await Promise.allSettled(documentIds.map((docId) => requestDocumentDeletion(docId, deleteMode))); + const completed = results.filter((result) => result.status === 'fulfilled').length; + const failed = 
results.filter((result) => result.status === 'rejected').length; + + if (failed > 0) { + alert(`Deleted ${completed} document(s), but failed to delete ${failed} document(s).`); + } + + if (selectionModeActive) { + window.toggleSelectionMode(); + } else { + selectedDocuments.clear(); + updateBulkActionButtons(); + } + + fetchUserDocuments(); + + if (deleteSelectedBtn) { + deleteSelectedBtn.disabled = false; + deleteSelectedBtn.innerHTML = 'Delete Selected'; + } }; // Remove self from selected shared documents diff --git a/application/single_app/templates/group_workspaces.html b/application/single_app/templates/group_workspaces.html index ea22c764..771649df 100644 --- a/application/single_app/templates/group_workspaces.html +++ b/application/single_app/templates/group_workspaces.html @@ -1688,6 +1688,199 @@ const groupTagSelectionModal = new bootstrap.Modal( document.getElementById("groupTagSelectionModal") ); + const groupDocumentDeleteModalElement = document.getElementById( + "groupDocumentDeleteModal" + ); + const groupDocumentDeleteModal = groupDocumentDeleteModalElement + ? new bootstrap.Modal(groupDocumentDeleteModalElement) + : null; + const groupDocumentDeleteModalTitle = document.getElementById( + "groupDocumentDeleteModalLabel" + ); + const groupDocumentDeleteModalBody = document.getElementById( + "groupDocumentDeleteModalBody" + ); + const groupDeleteCurrentBtn = document.getElementById( + "groupDeleteCurrentBtn" + ); + const groupDeleteAllBtn = document.getElementById("groupDeleteAllBtn"); + + function getGroupDeleteModalContent(documentCount) { + if (documentCount === 1) { + return { + title: "Delete Group Document", + body: ` +

Choose how to delete this group document revision.

+

Delete Current Version removes the visible revision and keeps older revisions for later comparison.

+

Delete All Versions permanently removes every stored revision for this document.

+ `, + }; + } + + return { + title: "Delete Selected Group Documents", + body: ` +

Choose how to delete ${documentCount} selected current group document revision(s).

+

Delete Current Version removes only the visible revision for each selected document and keeps older revisions.

+

Delete All Versions permanently removes every stored revision for each selected document.

+ `, + }; + } + + function showGroupDocumentDeleteFeedback(message, variant = "danger") { + if (typeof window.showToast === "function") { + window.showToast(message, variant); + return; + } + + let container = document.getElementById("groupDocumentDeleteFeedbackContainer"); + if (!container) { + container = document.createElement("div"); + container.id = "groupDocumentDeleteFeedbackContainer"; + container.className = "toast-container position-fixed top-0 end-0 p-3"; + document.body.appendChild(container); + } + + if (window.bootstrap && typeof window.bootstrap.Toast === "function") { + const toastElement = document.createElement("div"); + toastElement.className = `toast align-items-center text-white bg-${variant} border-0`; + toastElement.setAttribute("role", "alert"); + toastElement.setAttribute("aria-live", "assertive"); + toastElement.setAttribute("aria-atomic", "true"); + + const wrapper = document.createElement("div"); + wrapper.className = "d-flex"; + + const body = document.createElement("div"); + body.className = "toast-body"; + body.textContent = message; + + const closeButton = document.createElement("button"); + closeButton.type = "button"; + closeButton.className = "btn-close btn-close-white me-2 m-auto"; + closeButton.setAttribute("data-bs-dismiss", "toast"); + closeButton.setAttribute("aria-label", "Close"); + + wrapper.appendChild(body); + wrapper.appendChild(closeButton); + toastElement.appendChild(wrapper); + container.appendChild(toastElement); + + const toast = new window.bootstrap.Toast(toastElement); + toast.show(); + toastElement.addEventListener("hidden.bs.toast", () => { + toastElement.remove(); + }); + return; + } + + const alertElement = document.createElement("div"); + alertElement.className = `alert alert-${variant} alert-dismissible fade show mb-2`; + alertElement.setAttribute("role", "alert"); + + const body = document.createElement("span"); + body.textContent = message; + + const closeButton = document.createElement("button"); + 
closeButton.type = "button"; + closeButton.className = "btn-close"; + closeButton.setAttribute("data-bs-dismiss", "alert"); + closeButton.setAttribute("aria-label", "Close"); + + alertElement.appendChild(body); + alertElement.appendChild(closeButton); + container.appendChild(alertElement); + } + + function isGroupDocumentDeleteModalReady() { + return Boolean( + groupDocumentDeleteModal && + groupDocumentDeleteModalElement && + groupDocumentDeleteModalElement.isConnected && + groupDocumentDeleteModalBody && + groupDocumentDeleteModalBody.isConnected && + groupDeleteCurrentBtn && + groupDeleteCurrentBtn.isConnected && + groupDeleteAllBtn && + groupDeleteAllBtn.isConnected + ); + } + + function promptGroupDeleteMode(documentCount = 1) { + if (!isGroupDocumentDeleteModalReady()) { + showGroupDocumentDeleteFeedback("Delete confirmation dialog is unavailable. Refresh the page and try again."); + return Promise.resolve(null); + } + + const modalContent = getGroupDeleteModalContent(documentCount); + if (groupDocumentDeleteModalTitle) { + groupDocumentDeleteModalTitle.textContent = modalContent.title; + } + groupDocumentDeleteModalBody.innerHTML = modalContent.body; + + return new Promise((resolve) => { + let settled = false; + + const cleanup = () => { + groupDocumentDeleteModalElement.removeEventListener( + "hidden.bs.modal", + handleHidden + ); + groupDeleteCurrentBtn.removeEventListener("click", handleCurrentOnly); + groupDeleteAllBtn.removeEventListener("click", handleAllVersions); + }; + + const finalize = (value) => { + if (settled) { + return; + } + settled = true; + cleanup(); + resolve(value); + }; + + const handleHidden = () => finalize(null); + const handleCurrentOnly = () => { + groupDocumentDeleteModal.hide(); + finalize("current_only"); + }; + const handleAllVersions = () => { + groupDocumentDeleteModal.hide(); + finalize("all_versions"); + }; + + groupDocumentDeleteModalElement.addEventListener( + "hidden.bs.modal", + handleHidden + ); + 
groupDeleteCurrentBtn.addEventListener("click", handleCurrentOnly); + groupDeleteAllBtn.addEventListener("click", handleAllVersions); + groupDocumentDeleteModal.show(); + }); + } + + async function requestGroupDocumentDeletion(documentId, deleteMode) { + const query = new URLSearchParams({ delete_mode: deleteMode }); + const response = await fetch( + `/api/group_documents/${documentId}?${query.toString()}`, + { method: "DELETE" } + ); + + let responseData = {}; + try { + responseData = await response.json(); + } catch (error) { + responseData = {}; + } + + if (!response.ok) { + throw responseData.error + ? responseData + : { error: `Server responded with status ${response.status}` }; + } + + return responseData; + } // --- Editors --- const groupPromptContentEl = document.getElementById("group-prompt-content"); @@ -2129,7 +2322,7 @@ .finally(() => { if (deleteBtn) { deleteBtn.disabled = false; - deleteBtn.innerHTML = ` Delete`; + deleteBtn.innerHTML = `Delete Selected`; } }); } @@ -2418,6 +2611,27 @@ + + `; } } @@ -2833,39 +3047,43 @@ function fetchGroupDocuments() { if (!groupDocumentsTableBody || !activeGroupId) return; // Need table and active group - const placeholder = document.getElementById( - "group-legacy-update-prompt-placeholder" - ); - if (placeholder) { - // remove old alert div if present - const old = placeholder.querySelector("#group-legacy-update-alert"); - if (old) old.remove(); - } - - // Show loading state - groupDocumentsTableBody.innerHTML = `
Loading group documents...`; - if (groupDocsPaginationContainer) - groupDocsPaginationContainer.innerHTML = ""; // Clear pagination + groupDocumentsTableBody.innerHTML = ` + + +
Loading...
+ Loading group documents... + + `; + if (groupDocsPaginationContainer) groupDocsPaginationContainer.innerHTML = ""; - // Build query parameters for group documents endpoint const params = new URLSearchParams({ page: groupDocsCurrentPage, page_size: groupDocsPageSize, - // Crucially, the backend /api/group_documents needs to know WHICH group - // It gets this from the user's active group setting server-side. - // We add filters here: }); - if (groupDocsSearchTerm) params.append("search", groupDocsSearchTerm); - if (groupDocsClassificationFilter) + if (groupDocsSearchTerm) { + params.append("search", groupDocsSearchTerm); + } + if (groupDocsClassificationFilter) { params.append("classification", groupDocsClassificationFilter); - if (groupDocsAuthorFilter) params.append("author", groupDocsAuthorFilter); - if (groupDocsKeywordsFilter) + } + if (groupDocsAuthorFilter) { + params.append("author", groupDocsAuthorFilter); + } + if (groupDocsKeywordsFilter) { params.append("keywords", groupDocsKeywordsFilter); - if (groupDocsAbstractFilter) + } + if (groupDocsAbstractFilter) { params.append("abstract", groupDocsAbstractFilter); - if (groupDocsTagsFilter) params.append("tags", groupDocsTagsFilter); - if (groupDocsSortBy !== '_ts') params.append("sort_by", groupDocsSortBy); - if (groupDocsSortOrder !== 'desc') params.append("sort_order", groupDocsSortOrder); + } + if (groupDocsTagsFilter) { + params.append("tags", groupDocsTagsFilter); + } + if (groupDocsSortBy !== "_ts") { + params.append("sort_by", groupDocsSortBy); + } + if (groupDocsSortOrder !== "desc") { + params.append("sort_order", groupDocsSortOrder); + } console.log("Fetching group documents with params:", params.toString()); @@ -3564,7 +3782,7 @@ window.onExtractGroupMetadata = onExtractGroupMetadata; // Expose globally // --- Delete Group Document --- - function deleteGroupDocument(documentId, event) { + async function deleteGroupDocument(documentId, event) { // Renamed function // Permission check should happen 
server-side, but can add UI check too if ( @@ -3573,16 +3791,17 @@ alert("You do not have permission to delete documents in this group."); return; } - if ( - !confirm( - "Are you sure you want to delete this group document? This action cannot be undone." - ) - ) + + const deleteMode = await promptGroupDeleteMode(1); + if (!deleteMode) { return; + } - const deleteBtn = event ? event.target.closest("button") : null; + const deleteBtn = event ? event.target.closest("a, button") : null; + const originalDeleteBtnHtml = deleteBtn ? deleteBtn.innerHTML : null; if (deleteBtn) { - deleteBtn.disabled = true; + deleteBtn.classList.add("disabled"); + deleteBtn.setAttribute("aria-disabled", "true"); deleteBtn.innerHTML = ``; } @@ -3594,32 +3813,17 @@ // Use the group DELETE endpoint. Pass group_id as query param IF backend requires it. // Assuming backend gets active_group_id from session/context. // If needed: `/api/group_documents/${documentId}?group_id=${activeGroupId}` - fetch(`/api/group_documents/${documentId}`, { method: "DELETE" }) - .then((response) => - response.ok - ? response.json() - : response.json().then((err) => Promise.reject(err)) - ) + requestGroupDocumentDeletion(documentId, deleteMode) .then((data) => { console.log("Group document deleted:", data); - const docRow = document.getElementById(`group-doc-row-${documentId}`); - const detailsRow = document.getElementById( - `group-details-row-${documentId}` - ); - const statusRow = document.getElementById( - `group-status-row-${documentId}` - ); - if (docRow) docRow.remove(); - if (detailsRow) detailsRow.remove(); - if (statusRow) statusRow.remove(); - // Refresh to update pagination etc. 
fetchGroupDocuments(); }) .catch((error) => { alert("Error deleting document: " + (error.error || error.message)); if (deleteBtn && document.body.contains(deleteBtn)) { - deleteBtn.disabled = false; - deleteBtn.innerHTML = ''; + deleteBtn.classList.remove("disabled"); + deleteBtn.removeAttribute("aria-disabled"); + deleteBtn.innerHTML = originalDeleteBtnHtml; } }); } diff --git a/application/single_app/templates/public_workspaces.html b/application/single_app/templates/public_workspaces.html index 2d7b446b..447410ac 100644 --- a/application/single_app/templates/public_workspaces.html +++ b/application/single_app/templates/public_workspaces.html @@ -462,6 +462,28 @@
Currently Shared With:
Workspace Tutorial +