From 30c699629997be42df91d03df212d41f55d21cd6 Mon Sep 17 00:00:00 2001 From: linmm Date: Mon, 18 May 2026 19:56:43 +0800 Subject: [PATCH 1/6] fix(graph): resolve edge creation failure due to vertex ID mismatch When loading graph data, the HugeGraph server assigns vertex IDs (e.g., "1:Sarah") that differ from LLM-predicted IDs (e.g., "person:Sarah"). This causes edge creation to fail with IllegalArgumentException because the edge references use the original LLM-predicted IDs which don't match actual vertex IDs in the graph. Add a vid_mapping to track the ID mapping and update edge references after vertex creation. Co-Authored-By: Claude Opus 4.6 --- .../operators/hugegraph_op/commit_to_hugegraph.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py index d464b80ef..a87628fd9 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py @@ -81,6 +81,7 @@ def load_into_graph(self, vertices, edges, schema): # pylint: disable=too-many- vertex_label_map = {v_label["name"]: v_label for v_label in schema["vertexlabels"]} edge_label_map = {e_label["name"]: e_label for e_label in schema["edgelabels"]} property_label_map = {p_label["name"]: p_label for p_label in schema["propertykeys"]} + vid_mapping = {} # mapping from LLM-generated vertex ID to actual server vertex ID for vertex in vertices: input_label = vertex["label"] @@ -146,12 +147,15 @@ def load_into_graph(self, vertices, edges, schema): # pylint: disable=too-many- continue # TODO: we could try batch add vertices first, setback to single-mode if failed + original_id = vertex.get("id") vid = self._handle_graph_creation(self.client.graph().addVertex, input_label, input_properties).id vertex["id"] = vid + if original_id: + 
vid_mapping[original_id] = vid for edge in edges: - start = edge["outV"] - end = edge["inV"] + start = vid_mapping.get(edge.get("outV"), edge.get("outV")) + end = vid_mapping.get(edge.get("inV"), edge.get("inV")) label = edge["label"] properties = edge["properties"] From 13b152733ef8631cf32ad724755004c0f910eaac Mon Sep 17 00:00:00 2001 From: imbajin Date: Tue, 19 May 2026 12:21:31 +0800 Subject: [PATCH 2/6] fix(graph): cover edge vertex id mapping - add regression coverage for LLM vertex ids differing from created ids - verify edges use server-created vertex ids after vertex creation - keep the change scoped to commit_to_hugegraph tests --- .../hugegraph_op/test_commit_to_hugegraph.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py index 634fdb961..2399ec100 100644 --- a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py +++ b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py @@ -351,6 +351,48 @@ def test_load_into_graph_with_data_type_validation_success(self, mock_handle_gra # Verify that _handle_graph_creation was called for each vertex and edge self.assertEqual(mock_handle_graph_creation.call_count, 3) # 2 vertices + 1 edge + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def test_load_into_graph_maps_llm_vertex_ids_to_created_vertex_ids(self, mock_handle_graph_creation): + """Test edges use server-created vertex ids when LLM ids differ.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="1:Tom Hanks"), + MagicMock(id="2:Forrest Gump"), + MagicMock(id="edge_id"), + ] + + vertices = [ + { + "id": "person:Tom Hanks", + "label": "person", + "properties": {"name": "Tom Hanks", "age": 67}, + }, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": {"title": 
"Forrest Gump", "year": 1994}, + }, + ] + edges = [ + { + "label": "acted_in", + "properties": {"role": "Forrest Gump"}, + "outV": "person:Tom Hanks", + "inV": "movie:Forrest Gump", + } + ] + + self.commit2graph.load_into_graph(vertices, edges, self.schema) + + self.assertEqual(vertices[0]["id"], "1:Tom Hanks") + self.assertEqual(vertices[1]["id"], "2:Forrest Gump") + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "1:Tom Hanks", + "2:Forrest Gump", + {"role": "Forrest Gump"}, + ) + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") def test_load_into_graph_with_data_type_validation_failure(self, mock_handle_graph_creation): """Test load_into_graph method with data type validation failure.""" From cff9714c00e8a8ab7c27789b97d233c7c2825dd3 Mon Sep 17 00:00:00 2001 From: imbajin Date: Tue, 19 May 2026 12:42:06 +0800 Subject: [PATCH 3/6] fix(graph): clarify extraction prompt id rules - define deterministic vertex ids from schema label ids - require edges to reference emitted vertex ids - add prompt contract tests and validate prompt output with agent --- .../src/hugegraph_llm/config/prompt_config.py | 178 +++++++++--------- .../src/tests/config/test_prompt_config.py | 46 +++++ 2 files changed, 138 insertions(+), 86 deletions(-) create mode 100644 hugegraph-llm/src/tests/config/test_prompt_config.py diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py index d56c830e8..5101a4196 100644 --- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py +++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py @@ -47,62 +47,54 @@ def __init__(self, llm_config_object): # Note: Users should modify the prompt(examples) according to the real schema and text (property_graph_extract.py) extract_graph_prompt_EN: str = """## Main Task -Given the following graph schema and a piece of text, your task is 
to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified. - -## Basic Rules: -### Schema Format: -Graph Schema: -- "vertices": [List of vertex labels and their properties] -- "edges": [List of edge labels, their source and target vertex labels, and properties] - -### Content Rule: -Please read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema. -You are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword. -For each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures: - -#### Vertex Format: -{"id":"vertexLabelID:entityName","label":"vertexLabel","type":"vertex","properties":{"propertyName":"propertyValue", ...}} - -where: - - "vertexLabelID": int - - "vertexLabel": str - - "entityName": str - - "type": "vertex" - - "properties": dict - -#### Edge Format: -{"id":"vertexlabelID:pk1!pk2!pk3", label":"edgeLabel","type":"edge","outV":"sourceVertexId","outVLabel":"sourceVertexLabel","inV":"targetVertexId","inVLabel":"targetVertexLabel","properties":{"propertyName":"propertyValue",...}} - -where: - - "id": int or str (conditional) (optional) - - "edgeLabel": str - - "type": "edge" - - "outV": str - - "outVLabel": str - - "inV": str - - "inVLabel": str - - "properties": dict - - "sourceVertexId": "vertexLabelID:entityName" - - "targetVertexId": "vertexLabelID:entityName" - -Strictly follow these rules: -1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information. -2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean). -3. 
If there are multiple primary keys, the strategy for generating VID is: vertexlabelID:pk1!pk2!pk3 (pk means primary key, and '!' is the separator). This id must be generated ONLY if there are multiple primary keys. If there is only one primary key, the strategy for generating VID is: int (sequencially increasing). -4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema. -5. Translate the schema fields into Chinese if the given text input is Chinese (Optional) - -Refer to the following baseline example to understand the output generation requirements: -## Example: -### Input example: -#### text: -Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. - -#### graph schema example: -{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}], "edges":[{"edge_label":"roommate", "source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]} - -### Output example: -{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}}], "edges":[{"id": 1, "label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" +Extract only the vertices and edges that are supported by the given graph schema and input text. Return valid JSON only. + +## Schema Contract +The Graph schema uses this shape: +- vertexlabels[]: each vertex label has "id", "name", "primary_keys", "properties", and optional "nullable_keys". +- edgelabels[]: each edge label has "name", "source_label", "target_label", and "properties". +- propertykeys[]: each property key has "name", "data_type", and "cardinality". 
+ +## Output Contract +Return exactly one JSON object: {"vertices": [...], "edges": [...]} + +Vertex object: +{"id":"vertex id","label":"vertex label","type":"vertex","properties":{"propertyName":"propertyValue", ...}} +- Every vertex must include "type":"vertex"; do not omit it. + +Edge object: +{"label":"edge label","type":"edge","outV":"source vertex id","outVLabel":"source vertex label","inV":"target vertex id","inVLabel":"target vertex label","properties":{"propertyName":"propertyValue", ...}} +- Every edge must include "type":"edge"; do not omit it. + +## Deterministic Vertex ID Rules +For every vertex, first find the schema entry where vertexlabels[].name equals the output label. +- vertexLabelID must be taken from that schema entry's vertexlabels[].id. Never invent it from the label text. +- If primary_keys has exactly one key: id = "{vertexLabelID}:{properties.}". +- If primary_keys has multiple keys: id = "{vertexLabelID}:{properties.}!{properties.}" in the same order as schema primary_keys. +- Never use label names such as "person:Sarah" as vertex ids when schema gives a numeric vertex label id. + +## Edge Reference Rules +- outV and inV must exactly match the id of vertices in the same output. +- outVLabel/inVLabel must match the corresponding source/target vertex label. +- Only output an edge if both endpoint vertices are also present in vertices. +- Do not create an edge label that is not present in edgelabels[]. + +## Extraction Rules +1. Do not extract labels or properties that are absent from the schema. +2. Do not translate schema field names, labels, or property keys. Keep schema names exactly as provided. +3. Preserve property data types according to propertykeys[]; for example, INT stays number and BOOLEAN stays boolean. +4. Remove empty properties. Do not invent missing facts. +5. Output JSON only; no Markdown fences, prose, comments, or trailing text. 
+ +## Example +Input text: +Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James works as a journalist. + +Graph schema example: +{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} + +Output: +{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}}],"edges":[{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" graph_schema: str = """{ "vertexlabels": [ @@ -275,40 +267,54 @@ def __init__(self, llm_config_object): """ extract_graph_prompt_CN: str = """## 主要任务 -根据以下图谱和一段文本,你的任务是分析文本并提取符合模式结构的信息,将信息格式化为顶点和边。 - -## 基本规则 -### 模式格式 -图谱模式: -- 顶点:[顶点标签及其属性列表] -- 边:[边标签、源顶点标签、目标顶点标签及其属性列表] - -### 内容规则 -请仔细阅读提供的文本,识别与模式中定义的顶点和边相对应的信息。对于每一条匹配顶点或边的信息,按以下 JSON 结构格式化: - -#### 顶点格式: -{"id":"顶点标签 ID:实体名称","label":"顶点标签","type":"vertex","properties":{"属性名":"属性值", ...}} - -#### 边格式: -{"label":"边标签","type":"edge","outV":"源顶点 ID","outVLabel":"源顶点标签","inV":"目标顶点 ID","inVLabel":"目标顶点标签","properties":{"属性名":"属性值",...}} - -同时遵循以下规则: -1. 不要提取给定模式中不存在的属性字段或标签 -2. 确保提取的属性集与给定模式类型一致(如'age'应为数字,'select'应为布尔值) -3. 如果有多个主键,生成 VID 的策略是:顶点标签 ID:pk1!pk2!pk3(pk 表示主键,'!'是分隔符) -4. 以 JSON 格式输出,仅包含顶点和边,移除空属性,基于文本/规则和模式提取和格式化 -5. 
如果给定文本为中文但模式为英文,则将模式字段翻译成中文(可选) +只抽取输入文本和给定图谱 schema 共同支持的顶点与边。只返回合法 JSON。 + +## Schema 契约 +图谱 schema 使用以下结构: +- vertexlabels[]:每个顶点标签包含 "id"、"name"、"primary_keys"、"properties",以及可选的 "nullable_keys"。 +- edgelabels[]:每个边标签包含 "name"、"source_label"、"target_label"、"properties"。 +- propertykeys[]:每个属性包含 "name"、"data_type"、"cardinality"。 + +## 输出契约 +必须返回唯一 JSON 对象:{"vertices": [...], "edges": [...]} + +顶点对象: +{"id":"顶点 id","label":"顶点标签","type":"vertex","properties":{"属性名":"属性值", ...}} +- 每个顶点必须包含 "type":"vertex",不能省略。 + +边对象: +{"label":"边标签","type":"edge","outV":"源顶点 id","outVLabel":"源顶点标签","inV":"目标顶点 id","inVLabel":"目标顶点标签","properties":{"属性名":"属性值", ...}} +- 每条边必须包含 "type":"edge",不能省略。 + +## 确定性顶点 ID 规则 +对每个顶点,先找到 schema 中 vertexlabels[].name 等于输出 label 的条目。 +- vertexLabelID 必须取自该 schema 条目的 vertexlabels[].id,不能从标签文本猜测。 +- 如果 primary_keys 只有一个字段:id = "{vertexLabelID}:{properties.}"。 +- 如果 primary_keys 有多个字段:id = "{vertexLabelID}:{properties.}!{properties.}",顺序必须与 schema primary_keys 一致。 +- 当 schema 提供数字顶点标签 id 时,不要使用 "person:Sarah" 这样的标签名作为顶点 id。 + +## 边引用规则 +- outV 和 inV 必须严格等于本次输出 vertices 中的 id。 +- outVLabel/inVLabel 必须分别匹配对应源/目标顶点标签。 +- 只有当两个端点顶点都出现在 vertices 中时,才输出该边。 +- 不要输出 edgelabels[] 中不存在的边标签。 + +## 抽取规则 +1. 不要抽取 schema 中不存在的标签或属性。 +2. 不要翻译 schema 字段名、标签名或属性 key,必须与 schema 原文完全一致。 +3. 根据 propertykeys[] 保持属性类型,例如 INT 保持数字,BOOLEAN 保持布尔值。 +4. 移除空属性。不要编造缺失事实。 +5. 
只输出 JSON;不要输出 Markdown 代码块、解释文本、注释或尾随文本。 ## 示例 -### 输入示例: -#### 文本 -认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 在职业生活中是一名记者。 +输入文本: +认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 是一名记者。 -#### 图谱模式 -{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}], "edges":[{"edge_label":"roommate", "source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]} +图谱 schema 示例: +{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} -### 输出示例: -[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"记者"}},{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}] +输出: +{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"记者"}}],"edges":[{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]} """ gremlin_generate_prompt_CN: str = """ diff --git a/hugegraph-llm/src/tests/config/test_prompt_config.py b/hugegraph-llm/src/tests/config/test_prompt_config.py new file mode 100644 index 000000000..6a49c8b1f --- /dev/null +++ b/hugegraph-llm/src/tests/config/test_prompt_config.py @@ -0,0 +1,46 @@ +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from hugegraph_llm.config.prompt_config import PromptConfig + + +def test_extract_graph_prompt_en_defines_deterministic_vertex_id_rules(): + prompt = PromptConfig.extract_graph_prompt_EN + + assert "vertexlabels[].id" in prompt + assert "id = \"{vertexLabelID}:{properties.}\"" in prompt + assert "id = \"{vertexLabelID}:{properties.}!{properties.}\"" in prompt + assert 'Never use label names such as "person:Sarah"' in prompt + assert "outV and inV must exactly match the id of vertices in the same output" in prompt + assert 'Every vertex must include "type":"vertex"' in prompt + assert 'Every edge must include "type":"edge"' in prompt + assert "Do not translate schema field names" in prompt + assert '{"vertices": [...], "edges": [...]}' in prompt + + +def test_extract_graph_prompt_cn_matches_en_vertex_id_contract(): + prompt = PromptConfig.extract_graph_prompt_CN + + assert "vertexlabels[].id" in prompt + assert 'id = "{vertexLabelID}:{properties.}"' in prompt + assert 'id = "{vertexLabelID}:{properties.}!{properties.}"' in prompt + assert '不要使用 "person:Sarah"' in prompt + assert "outV 和 inV 必须严格等于本次输出 vertices 中的 id" in prompt + assert '每个顶点必须包含 "type":"vertex"' in prompt + assert '每条边必须包含 "type":"edge"' in 
prompt + assert "不要翻译 schema 字段名" in prompt + assert '{"vertices": [...], "edges": [...]}' in prompt From edf1d10d44ab7141ea0f97b1da7f3738b8d1ced5 Mon Sep 17 00:00:00 2001 From: imbajin Date: Tue, 19 May 2026 12:43:29 +0800 Subject: [PATCH 4/6] docs: refine agent guidance - add root AGENTS guidance for repo-wide module boundaries - refactor hugegraph-llm AGENTS into concise module rules - emphasize sufficient and effective test coverage - align llm test commands with CI external-service skips --- AGENTS.md | 48 +++++++++++++++ hugegraph-llm/AGENTS.md | 125 ++++++++++++++-------------------------- 2 files changed, 92 insertions(+), 81 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..0376b4e28 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,48 @@ +# AGENTS.md + +Guidance for AI agents working in this repository. Keep README content in README files; keep this file focused on decisions agents commonly get wrong. + +## Stack & Modules + +- This is a Python `uv` workspace. Prefer root-level workspace commands unless a module-specific file says otherwise. +- `hugegraph-llm/` is the primary and most frequently changed module. When editing or reviewing it, read `hugegraph-llm/AGENTS.md` first. +- `hugegraph-python-client/` is a supporting dependency for HugeGraph access. Change it only when the client contract itself must change, and verify `hugegraph-llm` callers when you do. +- Treat `hugegraph-ml/` and `vermeer-python-client/` as lower-frequency modules. Do not expand changes into them without a direct reason. + +## Testing Expectations + +- Any code change must include sufficient and effective test coverage for the changed behavior, regression risk, or failure path. +- Do not add tests that only improve coverage numbers while mocking away the behavior being changed. +- If a change cannot reasonably include automated tests, state why and provide the manual verification performed. 
+- Cross-module or shared dependency changes must test the affected downstream module, not only the package where the edit was made. + +## Code Search Anchors + +- `hugegraph-llm/src/hugegraph_llm/` - main LLM, RAG, KG, prompt, API, and vector-index code. +- `hugegraph-python-client/src/pyhugegraph/` - Python client used by LLM code to talk to HugeGraph. +- `pyproject.toml` and module `pyproject.toml` files - workspace membership, dependency groups, lint settings, Python versions. +- `rules/README.md` - staged AI-assisted workflow for multi-file features, API contract changes, or cross-module design changes. + +## Build & Test + +```bash +uv sync --all-extras +uv run ruff format --check . +uv run ruff check . +``` + +- Run tests for the affected module rather than defaulting to a full-repository test sweep. +- For `hugegraph-llm`, use the module CI split between unit-style tests and integration tests. +- For `hugegraph-python-client`, include client tests and any `hugegraph-llm` tests needed to validate caller compatibility. + +## Agent Workflow + +- Before editing, identify whether the change belongs to `hugegraph-llm`, `hugegraph-python-client`, or root workspace configuration. +- For multi-file features, API contract changes, or cross-module design changes, read `rules/README.md` first. +- Keep changes scoped to the module that owns the behavior. Avoid opportunistic rewrites in sibling modules. + +## Cross-module Notes + +- Root dependency or workspace changes can affect multiple packages; verify the package that consumes the changed dependency. +- `hugegraph-llm` imports `hugegraph-python-client`; client API changes must preserve or deliberately update those call sites. +- Do not duplicate README quick-start, Docker, or deployment instructions in AGENTS files. 
diff --git a/hugegraph-llm/AGENTS.md b/hugegraph-llm/AGENTS.md index 4ca973fff..bc50fb5d4 100644 --- a/hugegraph-llm/AGENTS.md +++ b/hugegraph-llm/AGENTS.md @@ -1,93 +1,56 @@ -# Basic Introduction +# hugegraph-llm AGENTS.md -This file provides guidance to AI coding tools and developers when working with code in this repository. +Module-specific guidance for AI agents. Root `../AGENTS.md` still applies; this file only adds rules that matter inside `hugegraph-llm`. -## Project Overview +## Module Focus -HugeGraph-LLM is a comprehensive toolkit that bridges graph databases and large language models, -part of the Apache HugeGraph AI ecosystem. It enables seamless integration between HugeGraph and LLMs for building -intelligent applications with three main capabilities: Knowledge Graph Construction, Graph-Enhanced RAG, -and Text2Gremlin query generation. +- This module owns GraphRAG, knowledge graph construction, and Text2Gremlin behavior. +- Prefer changes in the owning layer first. If a fix crosses API, flow, node, operator, model, prompt, or index boundaries, preserve the existing contract or update tests for the new contract explicitly. +- `hugegraph-python-client` is the HugeGraph access boundary. Prefer adapting LLM-side code unless the client contract is actually wrong. -## Tech Stack +## Testing Expectations -- **Language**: Python 3.10+ (uv package manager required) -- **Framework**: FastAPI + Gradio for web interfaces -- **Graph Database**: HugeGraph Server 1.5+ -- **LLM Integration**: LiteLLM (supports OpenAI, Ollama, Qianfan, etc.) -- **Vector Operations**: FAISS, NumPy, and will support multiple Vector DB soon -- **Code style**: ruff & mypy (on the way, soon) -- **Key Dependencies**: hugegraph-python-client +- Any code change must add or update tests that exercise the changed behavior, regression risk, or failure path. +- For pipeline changes, cover the relevant flow, node, or operator contract instead of only testing a helper in isolation. 
+- For API or request/response changes, cover the public model or endpoint behavior. +- For prompt or Text2Gremlin changes, preserve and test the expected output contract, especially Gremlin-only fenced output when callers depend on it. +- External-service tests may be skipped only through explicit, traceable skip controls. Do not hide failures by silently swallowing HugeGraph, LLM provider, or vector DB connection errors. -## Essential Commands +## Code Search Anchors + +- `src/hugegraph_llm/api/` and `src/hugegraph_llm/api/models/` - FastAPI endpoints and request/response models. +- `src/hugegraph_llm/flows/`, `src/hugegraph_llm/nodes/`, and `src/hugegraph_llm/operators/` - pipeline orchestration and executable units. +- `src/hugegraph_llm/config/` and `src/hugegraph_llm/resources/` - runtime config and prompt resources. +- `src/hugegraph_llm/indices/` - vector index implementations and backends. +- `src/tests/` - unit, integration, and contract tests for this module. + +## Build & Test + +From the repository root: -### Running the Application ```bash -# Install dependencies and create virtual environment (uv already installed) -uv sync -# Activate virtual environment -source .venv/bin/activate -# Launch main RAG demo application -python -m hugegraph_llm.demo.rag_demo.app -# Custom host/port -python -m hugegraph_llm.demo.rag_demo.app --host 127.0.0.1 --port 18001 +uv sync --extra llm --extra dev ``` -### Testing +From `hugegraph-llm/`, these commands mirror the CI split: + ```bash -pytest src/tests/ -# Or using unittest -python -m unittest discover src/tests/ +SKIP_EXTERNAL_SERVICES=true uv run pytest src/tests/config/ src/tests/document/ src/tests/middleware/ src/tests/operators/ src/tests/models/ src/tests/indices/ src/tests/test_utils.py -v --tb=short +SKIP_EXTERNAL_SERVICES=true uv run pytest src/tests/integration/test_graph_rag_pipeline.py src/tests/integration/test_kg_construction.py src/tests/integration/test_rag_pipeline.py -v --tb=short ``` -PS: we skip 
Docker Deployment details here. - -## Architecture Overview - -### Core Directory Structure -- `src/hugegraph_llm/api/` - FastAPI endpoints (rag_api.py, admin_api.py) -- `src/hugegraph_llm/demo/rag_demo/` - Main Gradio UI application -- `src/hugegraph_llm/operators/` - Core processing pipelines -- `src/hugegraph_llm/models/` - LLM, embedding, reranker implementations -- `src/hugegraph_llm/indices/` - Vector and graph indexing -- `src/hugegraph_llm/config/` - Configuration management -- `src/hugegraph_llm/utils/` - Utilities, logging, decorators - -### Key Processing Pipelines - -1. **KG Construction** (`operators/kg_construction_task.py`) - - Text chunking and vectorization pipeline - - Schema management and validation - - Information extraction using LLMs - - Graph data commitment to HugeGraph - -2. **Graph RAG** (`operators/graph_rag_task.py`) - - Multi-modal retrieval (vector, graph, hybrid) - - Keyword extraction and entity matching - - Graph traversal and Gremlin query generation - - Result merging and reranking - -3. **Text2Gremlin** (`operators/gremlin_generate_task.py`) - - Natural language to Gremlin query conversion - - Template-based and few-shot learning approaches - -### Configuration Management - -- Main config: `.env` file (generate with `config.generate` module) -- Prompt config: `src/hugegraph_llm/resources/demo/config_prompt.yaml` -- HugeGraph connection settings in environment variables -- LLM provider configuration through `LiteLLM` & `openai/ollama` client - -## Development Workflow - -1. **Prerequisites**: Ensure HugeGraph Server is running and LLM provider is configured -2. **Environment Setup**: Use UV for dependency management, activate virtual environment -3. **Configuration**: Generate configs and set up .env file with proper credentials -4. **Development**: Use Gradio demo for interactive testing, FastAPI for programmatic access -5. 
**Testing**: Unit tests use standard unittest framework in src/tests/ - -## Important Notes - -- Always use `uv` package manager instead of `pip` for dependency management -- HugeGraph Server must be accessible while running the app -- The system supports multiple LLM providers through `LiteLLM` abstraction -- Each file should be better < 600 lines for maintainability + +- Use narrower `pytest` targets while iterating, but finish with coverage that matches the touched behavior. +- For Python code changes, run root `uv run ruff format --check .` and `uv run ruff check .` before handoff. + +## LLM-specific Rules + +- Preserve Text2Gremlin prompt/output contracts unless the task explicitly changes them. +- Keep GraphRAG retrieval, KG construction, and Text2Gremlin paths behaviorally separate; shared helpers should not blur pipeline semantics. +- Do not introduce a new LLM, embedding, reranker, or vector DB dependency without wiring it through existing config patterns. +- Treat HugeGraph Server, LLM providers, and vector databases as external services with explicit configuration and explicit test skip behavior. + +## Style + +- Python is `>=3.10,<3.12` for this module. +- Use `uv` for dependency management; do not document or rely on ad hoc `pip install` workflows. +- Ruff and mypy behavior comes from `pyproject.toml`; do not duplicate their rule sets here. 
From 3242c932987ee3cda8280b710c72bc8897ec4f8d Mon Sep 17 00:00:00 2001 From: imbajin Date: Tue, 19 May 2026 13:01:54 +0800 Subject: [PATCH 5/6] fix(graph): infer grouped extraction item types - infer vertex and edge type from grouped extraction arrays - remove redundant item-level type requirement from extraction prompts - align prompt example resources with deterministic vertex id rules - add parser and prompt-contract regression coverage --- .../src/hugegraph_llm/config/prompt_config.py | 16 +- .../llm_op/property_graph_extract.py | 7 +- .../prompt_examples/prompt_examples.json | 8 +- .../src/tests/config/test_prompt_config.py | 138 ++++++++++++++---- .../llm_op/test_property_graph_extract.py | 70 +++++++++ 5 files changed, 197 insertions(+), 42 deletions(-) diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py index 5101a4196..bf42d3865 100644 --- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py +++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py @@ -59,12 +59,10 @@ def __init__(self, llm_config_object): Return exactly one JSON object: {"vertices": [...], "edges": [...]} Vertex object: -{"id":"vertex id","label":"vertex label","type":"vertex","properties":{"propertyName":"propertyValue", ...}} -- Every vertex must include "type":"vertex"; do not omit it. +{"id":"vertex id","label":"vertex label","properties":{"propertyName":"propertyValue", ...}} Edge object: -{"label":"edge label","type":"edge","outV":"source vertex id","outVLabel":"source vertex label","inV":"target vertex id","inVLabel":"target vertex label","properties":{"propertyName":"propertyValue", ...}} -- Every edge must include "type":"edge"; do not omit it. 
+{"label":"edge label","outV":"source vertex id","outVLabel":"source vertex label","inV":"target vertex id","inVLabel":"target vertex label","properties":{"propertyName":"propertyValue", ...}} ## Deterministic Vertex ID Rules For every vertex, first find the schema entry where vertexlabels[].name equals the output label. @@ -94,7 +92,7 @@ def __init__(self, llm_config_object): {"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} Output: -{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}}],"edges":[{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" +{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"journalist"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" graph_schema: str = """{ "vertexlabels": [ @@ -279,12 +277,10 @@ def __init__(self, llm_config_object): 必须返回唯一 JSON 对象:{"vertices": [...], "edges": [...]} 顶点对象: -{"id":"顶点 id","label":"顶点标签","type":"vertex","properties":{"属性名":"属性值", ...}} -- 每个顶点必须包含 "type":"vertex",不能省略。 +{"id":"顶点 id","label":"顶点标签","properties":{"属性名":"属性值", ...}} 边对象: 
-{"label":"边标签","type":"edge","outV":"源顶点 id","outVLabel":"源顶点标签","inV":"目标顶点 id","inVLabel":"目标顶点标签","properties":{"属性名":"属性值", ...}} -- 每条边必须包含 "type":"edge",不能省略。 +{"label":"边标签","outV":"源顶点 id","outVLabel":"源顶点标签","inV":"目标顶点 id","inVLabel":"目标顶点标签","properties":{"属性名":"属性值", ...}} ## 确定性顶点 ID 规则 对每个顶点,先找到 schema 中 vertexlabels[].name 等于输出 label 的条目。 @@ -314,7 +310,7 @@ def __init__(self, llm_config_object): {"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} 输出: -{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"记者"}}],"edges":[{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]} +{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"记者"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]} """ gremlin_generate_prompt_CN: str = """ diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py index bf3630192..bb84434fb 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py +++ 
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py @@ -147,11 +147,14 @@ def process_items(item_list, valid_labels, item_type): if not isinstance(item, dict): log.warning("Invalid property graph item type '%s'.", type(item)) continue + item = dict(item) + item_type_value = item.get("type", item_type) + item["type"] = item_type_value if not self.NECESSARY_ITEM_KEYS.issubset(item.keys()): log.warning("Invalid item keys '%s'.", item.keys()) continue - if item["type"] != item_type: - log.warning("Invalid %s type '%s' has been ignored.", item_type, item["type"]) + if item_type_value != item_type: + log.warning("Invalid %s type '%s' has been ignored.", item_type, item_type_value) continue if item["label"] not in valid_labels: log.warning( diff --git a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json index f3bd33c37..c4acda03d 100644 --- a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json +++ b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json @@ -3,24 +3,24 @@ "name": "Official Person-Relationship Extraction", "description": "A standard template for extracting Person and Webpage entities, along with their relationships (Roommate, Owns), from descriptive text.", "text": "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. 
Additionally, Sarah is the proud owner of the website www.sarahsplace.com.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. If there are multiple primary keys, the strategy for generating VID is: vertexlabelID:pk1!pk2!pk3 (pk means primary key, and '!' is the separator). This id must be generated ONLY if there are multiple primary keys. 
If there is only one primary key, the strategy for generating VID is: int (sequencially increasing).\n4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n5. Translate the schema fields into Chinese if the given text input is Chinese (Optional)\n\n## Example:\n### Input example:\n#### text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"person\",\"properties\":[\"name\",\"age\",\"occupation\"]}], \"edges\":[{\"edge_label\":\"roommate\", \"source_vertex_label\":\"person\",\"target_vertex_label\":\"person\",\"properties\":[\"date\"]}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"type\":\"vertex\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"type\":\"vertex\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}], \"edges\":[{\"label\":\"roommate\",\"type\":\"edge\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" + "prompt": "## Main Task\nGiven the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. 
Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Use the vertex label id as the id prefix. If there is one primary key, generate VID as vertexlabelID:primaryKeyValue. If there are multiple primary keys, generate VID as vertexlabelID:pk1!pk2!pk3 in schema primary-key order.\n4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n5. Translate the schema fields into Chinese if the given text input is Chinese (Optional)\n\n## Example:\n### Input example:\n#### text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. 
James, in his professional life, works as a journalist.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"person\",\"properties\":[\"name\",\"age\",\"occupation\"]}], \"edges\":[{\"edge_label\":\"roommate\", \"source_vertex_label\":\"person\",\"target_vertex_label\":\"person\",\"properties\":[\"date\"]}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}], \"edges\":[{\"label\":\"roommate\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" }, { "name": "Traffic Accident Element Extraction", "description": "Extracts key elements from a traffic accident report, including persons involved, vehicles, and responsibilities.", "text": "On March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. The traffic police determined that John Smith was fully responsible for running a red light.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text about a traffic accident, your task is to extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. 
Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nOn March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. 
The traffic police determined that John Smith was fully responsible for running a red light.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Person\",\"properties\":[\"name\",\"injury_level\"]},{\"vertex_label\":\"Vehicle\",\"properties\":[\"license_plate\",\"type\",\"color\"]},{\"vertex_label\":\"Accident\",\"properties\":[\"date\",\"location\",\"responsible_party\"]}], \"edges\":[{\"edge_label\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\"},{\"edge_label\":\"participated_in\",\"source_label\":\"Person\",\"target_label\":\"Accident\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"type\":\"vertex\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"type\":\"vertex\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"type\":\"vertex\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"type\":\"edge\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" + "prompt": "## Main Task\nGiven the following graph schema and a piece of text about a traffic accident, your task is to extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. 
Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nOn March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. 
The traffic police determined that John Smith was fully responsible for running a red light.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Person\",\"properties\":[\"name\",\"injury_level\"]},{\"vertex_label\":\"Vehicle\",\"properties\":[\"license_plate\",\"type\",\"color\"]},{\"vertex_label\":\"Accident\",\"properties\":[\"date\",\"location\",\"responsible_party\"]}], \"edges\":[{\"edge_label\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\"},{\"edge_label\":\"participated_in\",\"source_label\":\"Person\",\"target_label\":\"Accident\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" }, { "name": "Financial Event Extraction", "description": "Extracts key financial information such as companies, acquisition events, and amounts from financial news.", "text": "Tech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of financial news, your task is to extract information about corporate mergers and acquisitions.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nTech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Company\",\"properties\":[\"name\",\"field\"]},{\"vertex_label\":\"Acquisition\",\"properties\":[\"price\",\"currency\",\"status\"]}], \"edges\":[{\"edge_label\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"},{\"edge_label\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"type\":\"vertex\",\"properties\":{\"name\":\"Company A\",\"field\":\"Tech\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"type\":\"vertex\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:A acquires B\",\"label\":\"Acquisition\",\"type\":\"vertex\",\"properties\":{\"price\":2000000000,\"currency\":\"USD\",\"status\":\"expected to be completed by year-end\"}}],\"edges\":[{\"label\":\"acquirer\",\"type\":\"edge\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"type\":\"edge\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" + "prompt": "## Main Task\nGiven the following graph schema and a piece of financial news, your task is to extract information about corporate mergers and acquisitions.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. 
Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nTech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Company\",\"properties\":[\"name\",\"field\"]},{\"vertex_label\":\"Acquisition\",\"properties\":[\"price\",\"currency\",\"status\"]}], \"edges\":[{\"edge_label\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"},{\"edge_label\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"properties\":{\"name\":\"Company A\",\"field\":\"Tech\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:A acquires B\",\"label\":\"Acquisition\",\"properties\":{\"price\":2000000000,\"currency\":\"USD\",\"status\":\"expected to be completed by year-end\"}}],\"edges\":[{\"label\":\"acquirer\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" }, { "name": "Medical Diagnosis Extraction", "description": "Extracts patients, symptoms, diagnosis results, and recommended drugs from medical record text.", "text": "Patient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of medical record, your task is to extract entities and relationships related to diagnosis and treatment.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nPatient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Patient\",\"properties\":[\"name\"]},{\"vertex_label\":\"Symptom\",\"properties\":[\"name\"]},{\"vertex_label\":\"Diagnosis\",\"properties\":[\"name\"]},{\"vertex_label\":\"Drug\",\"properties\":[\"name\"]}], \"edges\":[{\"edge_label\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\"},{\"edge_label\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\"},{\"edge_label\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"type\":\"vertex\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"type\":\"vertex\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"type\":\"vertex\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"type\":\"vertex\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"type\":\"vertex\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"type\":\"edge\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"type\":\"edge\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"type\":\"edge\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" + "prompt": "## Main Task\nGiven the following graph schema and a piece of medical record, your task is to extract entities and relationships related to diagnosis and treatment.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": 
[List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nPatient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Patient\",\"properties\":[\"name\"]},{\"vertex_label\":\"Symptom\",\"properties\":[\"name\"]},{\"vertex_label\":\"Diagnosis\",\"properties\":[\"name\"]},{\"vertex_label\":\"Drug\",\"properties\":[\"name\"]}], \"edges\":[{\"edge_label\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\"},{\"edge_label\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\"},{\"edge_label\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" } ] diff --git a/hugegraph-llm/src/tests/config/test_prompt_config.py b/hugegraph-llm/src/tests/config/test_prompt_config.py index 6a49c8b1f..103d6cecf 100644 --- a/hugegraph-llm/src/tests/config/test_prompt_config.py +++ b/hugegraph-llm/src/tests/config/test_prompt_config.py @@ -15,32 +15,118 @@ # specific language governing permissions and limitations # under the License. 
+import json +from pathlib import Path +from unittest.mock import MagicMock + from hugegraph_llm.config.prompt_config import PromptConfig +from hugegraph_llm.models.llms.base import BaseLLM +from hugegraph_llm.operators.llm_op.property_graph_extract import PropertyGraphExtract + + +def _json_objects_after_marker(prompt, marker): + start = prompt.index(marker) + len(marker) + decoder = json.JSONDecoder() + objects = [] + index = start + while True: + index = prompt.find("{", index) + if index == -1: + return objects + try: + value, end = decoder.raw_decode(prompt[index:]) + except json.JSONDecodeError: + index += 1 + continue + objects.append(value) + index += end + + +def _example_schema_and_output(prompt, example_marker): + objects = _json_objects_after_marker(prompt, example_marker) + schema = next(obj for obj in objects if "vertexlabels" in obj and "edgelabels" in obj) + output = next(obj for obj in objects if "vertices" in obj and "edges" in obj) + return schema, output + + +def _assert_prompt_example_contract(prompt, example_marker): + schema, output = _example_schema_and_output(prompt, example_marker) + _assert_output_matches_schema_contract(schema, output) + + +def _assert_output_matches_schema_contract(schema, output): + assert set(output) == {"vertices", "edges"} + assert output["vertices"] + assert output["edges"] + + vertex_ids = {vertex["id"] for vertex in output["vertices"]} + vertex_labels = {vertex["label"] for vertex in output["vertices"]} + schema_vertices = {vertex["name"]: vertex for vertex in schema["vertexlabels"]} + schema_edge_labels = {edge["name"] for edge in schema["edgelabels"]} + + for vertex in output["vertices"]: + assert set(vertex) == {"id", "label", "properties"} + schema_vertex = schema_vertices[vertex["label"]] + primary_values = [str(vertex["properties"][key]) for key in schema_vertex["primary_keys"]] + expected_id = f"{schema_vertex['id']}:{'!'.join(primary_values)}" + assert vertex["id"] == expected_id + assert not 
vertex["id"].startswith(f"{vertex['label']}:") + assert isinstance(vertex["properties"], dict) + + for edge in output["edges"]: + assert set(edge) == {"label", "outV", "outVLabel", "inV", "inVLabel", "properties"} + assert edge["label"] in schema_edge_labels + assert edge["outV"] in vertex_ids + assert edge["inV"] in vertex_ids + assert edge["outVLabel"] in vertex_labels + assert edge["inVLabel"] in vertex_labels + assert isinstance(edge["properties"], dict) + + extractor = PropertyGraphExtract(llm=MagicMock(spec=BaseLLM)) + parsed_items = extractor._extract_and_filter_label(schema, json.dumps(output)) + assert {item["type"] for item in parsed_items} == {"vertex", "edge"} + assert len(parsed_items) == len(output["vertices"]) + len(output["edges"]) + + +def test_extract_graph_prompt_en_example_matches_parser_contract(): + _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_EN, "## Example") + + +def test_extract_graph_prompt_cn_example_matches_parser_contract(): + _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_CN, "## 示例") + + +def test_extract_graph_prompt_example_contract_rejects_label_name_vertex_id(): + schema, output = _example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example") + output["vertices"][0]["id"] = "person:Sarah" + + try: + _assert_output_matches_schema_contract(schema, output) + except AssertionError: + return + + raise AssertionError("Prompt example contract accepted a label-name vertex id") + + +def test_extract_graph_prompt_example_contract_rejects_dangling_edge_reference(): + schema, output = _example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example") + output["edges"][0]["outV"] = "1:Missing" + + try: + _assert_output_matches_schema_contract(schema, output) + except AssertionError: + return + + raise AssertionError("Prompt example contract accepted an edge reference outside vertices") + +def test_prompt_examples_do_not_require_redundant_item_type(): + examples_path = ( + 
Path(__file__).parents[2] / "hugegraph_llm" / "resources" / "prompt_examples" / "prompt_examples.json" + ) + examples = json.loads(examples_path.read_text(encoding="utf-8")) -def test_extract_graph_prompt_en_defines_deterministic_vertex_id_rules(): - prompt = PromptConfig.extract_graph_prompt_EN - - assert "vertexlabels[].id" in prompt - assert "id = \"{vertexLabelID}:{properties.}\"" in prompt - assert "id = \"{vertexLabelID}:{properties.}!{properties.}\"" in prompt - assert 'Never use label names such as "person:Sarah"' in prompt - assert "outV and inV must exactly match the id of vertices in the same output" in prompt - assert 'Every vertex must include "type":"vertex"' in prompt - assert 'Every edge must include "type":"edge"' in prompt - assert "Do not translate schema field names" in prompt - assert '{"vertices": [...], "edges": [...]}' in prompt - - -def test_extract_graph_prompt_cn_matches_en_vertex_id_contract(): - prompt = PromptConfig.extract_graph_prompt_CN - - assert "vertexlabels[].id" in prompt - assert 'id = "{vertexLabelID}:{properties.}"' in prompt - assert 'id = "{vertexLabelID}:{properties.}!{properties.}"' in prompt - assert '不要使用 "person:Sarah"' in prompt - assert "outV 和 inV 必须严格等于本次输出 vertices 中的 id" in prompt - assert '每个顶点必须包含 "type":"vertex"' in prompt - assert '每条边必须包含 "type":"edge"' in prompt - assert "不要翻译 schema 字段名" in prompt - assert '{"vertices": [...], "edges": [...]}' in prompt + for example in examples: + prompt = example["prompt"] + assert '"type":"vertex"' not in prompt + assert '"type":"edge"' not in prompt diff --git a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py index 5a2dee09e..132fb6f54 100644 --- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py +++ b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py @@ -200,6 +200,48 @@ def test_extract_and_filter_label_valid_json(self): 
self.assertEqual(result[1]["type"], "edge") self.assertEqual(result[1]["label"], "acted_in") + def test_extract_and_filter_label_infers_type_from_grouped_arrays(self): + """Infer item type from vertices/edges containers when LLM omits it.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["type"], "vertex") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "edge") + self.assertEqual(result[1]["label"], "acted_in") + def test_extract_and_filter_label_invalid_json(self): """Test the _extract_and_filter_label method with invalid JSON.""" extractor = PropertyGraphExtract(llm=self.mock_llm) @@ -233,6 +275,34 @@ def test_extract_and_filter_label_invalid_item_type(self): self.assertEqual(result, []) + def test_extract_and_filter_label_rejects_explicit_type_mismatch(self): + """Do not override an explicit item type that conflicts with its container.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "type": "edge", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "type": "vertex", + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result, []) + def test_extract_and_filter_label_invalid_label(self): """Test the _extract_and_filter_label method with invalid label.""" extractor = PropertyGraphExtract(llm=self.mock_llm) From 
6755a22f2f9c53b5da03ff51145d668945e29dd6 Mon Sep 17 00:00:00 2001 From: imbajin Date: Tue, 19 May 2026 19:13:52 +0800 Subject: [PATCH 6/6] fix(graph): normalize extracted graph ids - derive primary-key vertex ids from schema after LLM extraction - resolve or reject edge endpoints before graph commit - align prompt examples with the extraction contract - remove unrelated AGENTS changes from the PR diff --- AGENTS.md | 48 -- hugegraph-llm/AGENTS.md | 125 +++-- .../hugegraph_op/commit_to_hugegraph.py | 11 +- .../llm_op/property_graph_extract.py | 105 +++- .../prompt_examples/prompt_examples.json | 8 +- .../src/tests/config/test_prompt_config.py | 27 +- .../hugegraph_op/test_commit_to_hugegraph.py | 128 +++++ .../llm_op/test_property_graph_extract.py | 469 +++++++++++++++++- 8 files changed, 791 insertions(+), 130 deletions(-) delete mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 0376b4e28..000000000 --- a/AGENTS.md +++ /dev/null @@ -1,48 +0,0 @@ -# AGENTS.md - -Guidance for AI agents working in this repository. Keep README content in README files; keep this file focused on decisions agents commonly get wrong. - -## Stack & Modules - -- This is a Python `uv` workspace. Prefer root-level workspace commands unless a module-specific file says otherwise. -- `hugegraph-llm/` is the primary and most frequently changed module. When editing or reviewing it, read `hugegraph-llm/AGENTS.md` first. -- `hugegraph-python-client/` is a supporting dependency for HugeGraph access. Change it only when the client contract itself must change, and verify `hugegraph-llm` callers when you do. -- Treat `hugegraph-ml/` and `vermeer-python-client/` as lower-frequency modules. Do not expand changes into them without a direct reason. - -## Testing Expectations - -- Any code change must include sufficient and effective test coverage for the changed behavior, regression risk, or failure path. 
-- Do not add tests that only improve coverage numbers while mocking away the behavior being changed. -- If a change cannot reasonably include automated tests, state why and provide the manual verification performed. -- Cross-module or shared dependency changes must test the affected downstream module, not only the package where the edit was made. - -## Code Search Anchors - -- `hugegraph-llm/src/hugegraph_llm/` - main LLM, RAG, KG, prompt, API, and vector-index code. -- `hugegraph-python-client/src/pyhugegraph/` - Python client used by LLM code to talk to HugeGraph. -- `pyproject.toml` and module `pyproject.toml` files - workspace membership, dependency groups, lint settings, Python versions. -- `rules/README.md` - staged AI-assisted workflow for multi-file features, API contract changes, or cross-module design changes. - -## Build & Test - -```bash -uv sync --all-extras -uv run ruff format --check . -uv run ruff check . -``` - -- Run tests for the affected module rather than defaulting to a full-repository test sweep. -- For `hugegraph-llm`, use the module CI split between unit-style tests and integration tests. -- For `hugegraph-python-client`, include client tests and any `hugegraph-llm` tests needed to validate caller compatibility. - -## Agent Workflow - -- Before editing, identify whether the change belongs to `hugegraph-llm`, `hugegraph-python-client`, or root workspace configuration. -- For multi-file features, API contract changes, or cross-module design changes, read `rules/README.md` first. -- Keep changes scoped to the module that owns the behavior. Avoid opportunistic rewrites in sibling modules. - -## Cross-module Notes - -- Root dependency or workspace changes can affect multiple packages; verify the package that consumes the changed dependency. -- `hugegraph-llm` imports `hugegraph-python-client`; client API changes must preserve or deliberately update those call sites. 
-- Do not duplicate README quick-start, Docker, or deployment instructions in AGENTS files. diff --git a/hugegraph-llm/AGENTS.md b/hugegraph-llm/AGENTS.md index bc50fb5d4..4ca973fff 100644 --- a/hugegraph-llm/AGENTS.md +++ b/hugegraph-llm/AGENTS.md @@ -1,56 +1,93 @@ -# hugegraph-llm AGENTS.md +# Basic Introduction -Module-specific guidance for AI agents. Root `../AGENTS.md` still applies; this file only adds rules that matter inside `hugegraph-llm`. +This file provides guidance to AI coding tools and developers when working with code in this repository. -## Module Focus +## Project Overview -- This module owns GraphRAG, knowledge graph construction, and Text2Gremlin behavior. -- Prefer changes in the owning layer first. If a fix crosses API, flow, node, operator, model, prompt, or index boundaries, preserve the existing contract or update tests for the new contract explicitly. -- `hugegraph-python-client` is the HugeGraph access boundary. Prefer adapting LLM-side code unless the client contract is actually wrong. +HugeGraph-LLM is a comprehensive toolkit that bridges graph databases and large language models, +part of the Apache HugeGraph AI ecosystem. It enables seamless integration between HugeGraph and LLMs for building +intelligent applications with three main capabilities: Knowledge Graph Construction, Graph-Enhanced RAG, +and Text2Gremlin query generation. -## Testing Expectations +## Tech Stack -- Any code change must add or update tests that exercise the changed behavior, regression risk, or failure path. -- For pipeline changes, cover the relevant flow, node, or operator contract instead of only testing a helper in isolation. -- For API or request/response changes, cover the public model or endpoint behavior. -- For prompt or Text2Gremlin changes, preserve and test the expected output contract, especially Gremlin-only fenced output when callers depend on it. -- External-service tests may be skipped only through explicit, traceable skip controls. 
Do not hide failures by silently swallowing HugeGraph, LLM provider, or vector DB connection errors. +- **Language**: Python 3.10+ (uv package manager required) +- **Framework**: FastAPI + Gradio for web interfaces +- **Graph Database**: HugeGraph Server 1.5+ +- **LLM Integration**: LiteLLM (supports OpenAI, Ollama, Qianfan, etc.) +- **Vector Operations**: FAISS, NumPy, and will support multiple Vector DB soon +- **Code style**: ruff & mypy (on the way, soon) +- **Key Dependencies**: hugegraph-python-client -## Code Search Anchors - -- `src/hugegraph_llm/api/` and `src/hugegraph_llm/api/models/` - FastAPI endpoints and request/response models. -- `src/hugegraph_llm/flows/`, `src/hugegraph_llm/nodes/`, and `src/hugegraph_llm/operators/` - pipeline orchestration and executable units. -- `src/hugegraph_llm/config/` and `src/hugegraph_llm/resources/` - runtime config and prompt resources. -- `src/hugegraph_llm/indices/` - vector index implementations and backends. -- `src/tests/` - unit, integration, and contract tests for this module. 
- -## Build & Test - -From the repository root: +## Essential Commands +### Running the Application ```bash -uv sync --extra llm --extra dev +# Install dependencies and create virtual environment (uv already installed) +uv sync +# Activate virtual environment +source .venv/bin/activate +# Launch main RAG demo application +python -m hugegraph_llm.demo.rag_demo.app +# Custom host/port +python -m hugegraph_llm.demo.rag_demo.app --host 127.0.0.1 --port 18001 ``` -From `hugegraph-llm/`, these commands mirror the CI split: - +### Testing ```bash -SKIP_EXTERNAL_SERVICES=true uv run pytest src/tests/config/ src/tests/document/ src/tests/middleware/ src/tests/operators/ src/tests/models/ src/tests/indices/ src/tests/test_utils.py -v --tb=short -SKIP_EXTERNAL_SERVICES=true uv run pytest src/tests/integration/test_graph_rag_pipeline.py src/tests/integration/test_kg_construction.py src/tests/integration/test_rag_pipeline.py -v --tb=short +pytest src/tests/ +# Or using unittest +python -m unittest discover src/tests/ ``` - -- Use narrower `pytest` targets while iterating, but finish with coverage that matches the touched behavior. -- For Python code changes, run root `uv run ruff format --check .` and `uv run ruff check .` before handoff. - -## LLM-specific Rules - -- Preserve Text2Gremlin prompt/output contracts unless the task explicitly changes them. -- Keep GraphRAG retrieval, KG construction, and Text2Gremlin paths behaviorally separate; shared helpers should not blur pipeline semantics. -- Do not introduce a new LLM, embedding, reranker, or vector DB dependency without wiring it through existing config patterns. -- Treat HugeGraph Server, LLM providers, and vector databases as external services with explicit configuration and explicit test skip behavior. - -## Style - -- Python is `>=3.10,<3.12` for this module. -- Use `uv` for dependency management; do not document or rely on ad hoc `pip install` workflows. 
-- Ruff and mypy behavior comes from `pyproject.toml`; do not duplicate their rule sets here. +PS: we skip Docker Deployment details here. + +## Architecture Overview + +### Core Directory Structure +- `src/hugegraph_llm/api/` - FastAPI endpoints (rag_api.py, admin_api.py) +- `src/hugegraph_llm/demo/rag_demo/` - Main Gradio UI application +- `src/hugegraph_llm/operators/` - Core processing pipelines +- `src/hugegraph_llm/models/` - LLM, embedding, reranker implementations +- `src/hugegraph_llm/indices/` - Vector and graph indexing +- `src/hugegraph_llm/config/` - Configuration management +- `src/hugegraph_llm/utils/` - Utilities, logging, decorators + +### Key Processing Pipelines + +1. **KG Construction** (`operators/kg_construction_task.py`) + - Text chunking and vectorization pipeline + - Schema management and validation + - Information extraction using LLMs + - Graph data commitment to HugeGraph + +2. **Graph RAG** (`operators/graph_rag_task.py`) + - Multi-modal retrieval (vector, graph, hybrid) + - Keyword extraction and entity matching + - Graph traversal and Gremlin query generation + - Result merging and reranking + +3. **Text2Gremlin** (`operators/gremlin_generate_task.py`) + - Natural language to Gremlin query conversion + - Template-based and few-shot learning approaches + +### Configuration Management + +- Main config: `.env` file (generate with `config.generate` module) +- Prompt config: `src/hugegraph_llm/resources/demo/config_prompt.yaml` +- HugeGraph connection settings in environment variables +- LLM provider configuration through `LiteLLM` & `openai/ollama` client + +## Development Workflow + +1. **Prerequisites**: Ensure HugeGraph Server is running and LLM provider is configured +2. **Environment Setup**: Use UV for dependency management, activate virtual environment +3. **Configuration**: Generate configs and set up .env file with proper credentials +4. **Development**: Use Gradio demo for interactive testing, FastAPI for programmatic access +5. 
**Testing**: Unit tests use standard unittest framework in src/tests/ + +## Important Notes + +- Always use `uv` package manager instead of `pip` for dependency management +- HugeGraph Server must be accessible while running the app +- The system supports multiple LLM providers through `LiteLLM` abstraction +- Each file should be better < 600 lines for maintainability diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py index a87628fd9..daade304d 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py @@ -148,7 +148,16 @@ def load_into_graph(self, vertices, edges, schema): # pylint: disable=too-many- # TODO: we could try batch add vertices first, setback to single-mode if failed original_id = vertex.get("id") - vid = self._handle_graph_creation(self.client.graph().addVertex, input_label, input_properties).id + if vertex_label.get("id_strategy") == "CUSTOMIZE_STRING" and original_id: + result = self._handle_graph_creation( + self.client.graph().addVertex, + input_label, + input_properties, + id=original_id, + ) + else: + result = self._handle_graph_creation(self.client.graph().addVertex, input_label, input_properties) + vid = result.id vertex["id"] = vid if original_id: vid_mapping[original_id] = vid diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py index 212b5d544..ec4f7f332 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py @@ -120,6 +120,94 @@ def extract_property_graph_by_llm(self, schema, chunk): prompt = self.example_prompt + prompt return self.llm.generate(prompt=prompt) + @staticmethod + def 
_primary_key_id(vertex_label, properties): + id_strategy = vertex_label.get("id_strategy") + if id_strategy and str(id_strategy).upper() != "PRIMARY_KEY": + return None + primary_keys = vertex_label.get("primary_keys", []) + if not primary_keys or "id" not in vertex_label: + return None + values = [] + for key in primary_keys: + value = properties.get(key) + if value is None or value == "": + return None + values.append(str(value)) + return f"{vertex_label['id']}:{'!'.join(values)}" + + def _normalize_vertices(self, vertices, vertex_label_map): + vertex_id_map = {} + normalized_vertices = [] + for vertex in vertices: + label = vertex["label"] + properties = vertex["properties"] + canonical_id = self._primary_key_id(vertex_label_map[label], properties) + original_id = vertex.get("id") + if canonical_id is None: + if original_id: + vertex_id_map[(label, original_id)] = original_id + normalized_vertices.append(vertex) + continue + + vertex["id"] = canonical_id + vertex_id_map[(label, canonical_id)] = canonical_id + if original_id: + vertex_id_map[(label, original_id)] = canonical_id + normalized_vertices.append(vertex) + return normalized_vertices, vertex_id_map + + def _resolve_endpoint(self, edge, endpoint_key, label_key, legacy_key, vertex_label_map, vertex_id_map): + endpoint = edge.get(endpoint_key) + label = edge.get(label_key) + if endpoint and label: + return vertex_id_map.get((label, endpoint)), label + + legacy_endpoint = edge.get(legacy_key) + if not isinstance(legacy_endpoint, dict): + return None, label + + label = legacy_endpoint.get("label") + properties = legacy_endpoint.get("properties", {}) + if label not in vertex_label_map: + return None, label + canonical_id = self._primary_key_id(vertex_label_map[label], properties) + return vertex_id_map.get((label, canonical_id)), label + + def _normalize_edges(self, edges, edge_label_map, vertex_label_map, vertex_id_map): + normalized_edges = [] + for edge in edges: + edge_label = edge_label_map[edge["label"]] 
+ out_v, out_v_label = self._resolve_endpoint( + edge, + "outV", + "outVLabel", + "source", + vertex_label_map, + vertex_id_map, + ) + in_v, in_v_label = self._resolve_endpoint( + edge, + "inV", + "inVLabel", + "target", + vertex_label_map, + vertex_id_map, + ) + if not out_v or not in_v: + log.warning("Invalid edge endpoints '%s' have been ignored.", edge) + continue + if out_v_label != edge_label.get("source_label") or in_v_label != edge_label.get("target_label"): + log.warning("Invalid edge endpoint labels '%s' have been ignored.", edge) + continue + + edge["outV"] = out_v + edge["outVLabel"] = out_v_label + edge["inV"] = in_v + edge["inVLabel"] = in_v_label + normalized_edges.append(edge) + return normalized_edges + def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]: # Strip markdown code blocks (e.g. ```json ... ```) text = re.sub(r"```\w*\n?", "", text) @@ -147,10 +235,13 @@ def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]: return items # Create sets for valid vertex and edge labels based on the schema - vertex_label_set = {vertex["name"] for vertex in schema["vertexlabels"]} - edge_label_set = {edge["name"] for edge in schema["edgelabels"]} + vertex_label_map = {vertex["name"]: vertex for vertex in schema["vertexlabels"]} + edge_label_map = {edge["name"]: edge for edge in schema["edgelabels"]} + vertex_label_set = set(vertex_label_map) + edge_label_set = set(edge_label_map) def process_items(item_list, valid_labels, item_type): + parsed_items = [] for item in item_list: if not isinstance(item, dict): log.warning("Invalid property graph item type '%s'.", type(item)) @@ -171,10 +262,14 @@ def process_items(item_list, valid_labels, item_type): item["label"], ) continue - items.append(item) + parsed_items.append(item) + return parsed_items - process_items(property_graph["vertices"], vertex_label_set, "vertex") - process_items(property_graph["edges"], edge_label_set, "edge") + vertex_items = 
process_items(property_graph["vertices"], vertex_label_set, "vertex") + vertices, vertex_id_map = self._normalize_vertices(vertex_items, vertex_label_map) + edge_items = process_items(property_graph["edges"], edge_label_set, "edge") + edges = self._normalize_edges(edge_items, edge_label_map, vertex_label_map, vertex_id_map) + items = vertices + edges except json.JSONDecodeError: log.critical("Invalid property graph JSON! Please check the extracted JSON data carefully") return items diff --git a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json index c4acda03d..3e7b17f44 100644 --- a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json +++ b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json @@ -3,24 +3,24 @@ "name": "Official Person-Relationship Extraction", "description": "A standard template for extracting Person and Webpage entities, along with their relationships (Roommate, Owns), from descriptive text.", "text": "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website www.sarahsplace.com.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. 
Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Use the vertex label id as the id prefix. If there is one primary key, generate VID as vertexlabelID:primaryKeyValue. If there are multiple primary keys, generate VID as vertexlabelID:pk1!pk2!pk3 in schema primary-key order.\n4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n5. Translate the schema fields into Chinese if the given text input is Chinese (Optional)\n\n## Example:\n### Input example:\n#### text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. 
James, in his professional life, works as a journalist.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"person\",\"properties\":[\"name\",\"age\",\"occupation\"]}], \"edges\":[{\"edge_label\":\"roommate\", \"source_vertex_label\":\"person\",\"target_vertex_label\":\"person\",\"properties\":[\"date\"]}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}], \"edges\":[{\"label\":\"roommate\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.}!{properties.}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data 
types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James works as a journalist.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"person\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"age\",\"occupation\"],\"nullable_keys\":[\"age\",\"occupation\"]}],\"edgelabels\":[{\"name\":\"roommate\",\"source_label\":\"person\",\"target_label\":\"person\",\"properties\":[\"date\"]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"age\",\"data_type\":\"INT\",\"cardinality\":\"SINGLE\"},{\"name\":\"occupation\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"date\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}],\"edges\":[{\"label\":\"roommate\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" }, { "name": "Traffic Accident Element Extraction", "description": "Extracts key elements from a traffic accident report, including persons involved, vehicles, and responsibilities.", "text": "On March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. 
The traffic police determined that John Smith was fully responsible for running a red light.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text about a traffic accident, your task is to extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. 
Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nOn March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. The traffic police determined that John Smith was fully responsible for running a red light.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Person\",\"properties\":[\"name\",\"injury_level\"]},{\"vertex_label\":\"Vehicle\",\"properties\":[\"license_plate\",\"type\",\"color\"]},{\"vertex_label\":\"Accident\",\"properties\":[\"date\",\"location\",\"responsible_party\"]}], \"edges\":[{\"edge_label\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\"},{\"edge_label\":\"participated_in\",\"source_label\":\"Person\",\"target_label\":\"Accident\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.<primary_key>}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.<primary_key_1>}!{properties.<primary_key_2>}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nOn March 15, 2024, John Smith drove a red Porsche with license plate NY-88888 and collided with a scooter ridden by Mike Lee. 
Mike Lee suffered a right leg fracture.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Person\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"injury_level\"],\"nullable_keys\":[\"injury_level\"]},{\"id\":2,\"name\":\"Vehicle\",\"primary_keys\":[\"license_plate\"],\"properties\":[\"license_plate\",\"type\",\"color\"],\"nullable_keys\":[\"type\",\"color\"]}],\"edgelabels\":[{\"name\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"injury_level\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"license_plate\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"type\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"color\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" }, { "name": "Financial Event Extraction", "description": "Extracts key financial information such as companies, acquisition events, and amounts from financial news.", "text": "Tech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of financial news, your task is to extract information about corporate mergers and acquisitions.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nTech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Company\",\"properties\":[\"name\",\"field\"]},{\"vertex_label\":\"Acquisition\",\"properties\":[\"price\",\"currency\",\"status\"]}], \"edges\":[{\"edge_label\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"},{\"edge_label\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"properties\":{\"name\":\"Company A\",\"field\":\"Tech\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:A acquires B\",\"label\":\"Acquisition\",\"properties\":{\"price\":2000000000,\"currency\":\"USD\",\"status\":\"expected to be completed by year-end\"}}],\"edges\":[{\"label\":\"acquirer\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.<primary_key>}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.<primary_key_1>}!{properties.<primary_key_2>}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nTech giant Company A will fully acquire startup Company B, which operates in artificial intelligence, for $2 billion.\n\nGraph schema 
example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Company\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"field\"],\"nullable_keys\":[\"field\"]},{\"id\":2,\"name\":\"Acquisition\",\"primary_keys\":[\"deal\"],\"properties\":[\"deal\",\"price\",\"currency\"],\"nullable_keys\":[\"price\",\"currency\"]}],\"edgelabels\":[{\"name\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\",\"properties\":[]},{\"name\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"field\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"deal\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"price\",\"data_type\":\"LONG\",\"cardinality\":\"SINGLE\"},{\"name\":\"currency\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"properties\":{\"name\":\"Company A\",\"field\":\"technology\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:Company A acquires Company B\",\"label\":\"Acquisition\",\"properties\":{\"deal\":\"Company A acquires Company B\",\"price\":2000000000,\"currency\":\"USD\"}}],\"edges\":[{\"label\":\"acquirer\",\"outV\":\"2:Company A acquires Company B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"outV\":\"2:Company A acquires Company B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" }, { "name": "Medical Diagnosis Extraction", "description": "Extracts patients, symptoms, diagnosis results, and recommended drugs from medical record text.", "text": "Patient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of medical record, your task is to extract entities and relationships related to diagnosis and treatment.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nPatient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Patient\",\"properties\":[\"name\"]},{\"vertex_label\":\"Symptom\",\"properties\":[\"name\"]},{\"vertex_label\":\"Diagnosis\",\"properties\":[\"name\"]},{\"vertex_label\":\"Drug\",\"properties\":[\"name\"]}], \"edges\":[{\"edge_label\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\"},{\"edge_label\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\"},{\"edge_label\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.<primary_key>}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.<primary_key_1>}!{properties.<primary_key_2>}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nPatient Li Hua has a headache and fever for three days. 
The diagnosis is viral cold, and the recommended drug is Gankang.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Patient\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":2,\"name\":\"Symptom\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":3,\"name\":\"Diagnosis\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":4,\"name\":\"Drug\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]}],\"edgelabels\":[{\"name\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\",\"properties\":[]},{\"name\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\",\"properties\":[]},{\"name\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:fever\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" 
} ] diff --git a/hugegraph-llm/src/tests/config/test_prompt_config.py b/hugegraph-llm/src/tests/config/test_prompt_config.py index 103d6cecf..0a46b4c62 100644 --- a/hugegraph-llm/src/tests/config/test_prompt_config.py +++ b/hugegraph-llm/src/tests/config/test_prompt_config.py @@ -62,7 +62,7 @@ def _assert_output_matches_schema_contract(schema, output): vertex_ids = {vertex["id"] for vertex in output["vertices"]} vertex_labels = {vertex["label"] for vertex in output["vertices"]} schema_vertices = {vertex["name"]: vertex for vertex in schema["vertexlabels"]} - schema_edge_labels = {edge["name"] for edge in schema["edgelabels"]} + schema_edges = {edge["name"]: edge for edge in schema["edgelabels"]} for vertex in output["vertices"]: assert set(vertex) == {"id", "label", "properties"} @@ -75,11 +75,13 @@ def _assert_output_matches_schema_contract(schema, output): for edge in output["edges"]: assert set(edge) == {"label", "outV", "outVLabel", "inV", "inVLabel", "properties"} - assert edge["label"] in schema_edge_labels + assert edge["label"] in schema_edges assert edge["outV"] in vertex_ids assert edge["inV"] in vertex_ids assert edge["outVLabel"] in vertex_labels assert edge["inVLabel"] in vertex_labels + assert edge["outVLabel"] == schema_edges[edge["label"]]["source_label"] + assert edge["inVLabel"] == schema_edges[edge["label"]]["target_label"] assert isinstance(edge["properties"], dict) extractor = PropertyGraphExtract(llm=MagicMock(spec=BaseLLM)) @@ -120,7 +122,7 @@ def test_extract_graph_prompt_example_contract_rejects_dangling_edge_reference() raise AssertionError("Prompt example contract accepted an edge reference outside vertices") -def test_prompt_examples_do_not_require_redundant_item_type(): +def test_prompt_examples_match_extraction_contract(): examples_path = ( Path(__file__).parents[2] / "hugegraph_llm" / "resources" / "prompt_examples" / "prompt_examples.json" ) @@ -130,3 +132,22 @@ def test_prompt_examples_do_not_require_redundant_item_type(): prompt = 
example["prompt"] assert '"type":"vertex"' not in prompt assert '"type":"edge"' not in prompt + _assert_prompt_example_contract(prompt, "## Example") + + +def test_prompt_examples_use_matching_domain_examples(): + examples_path = ( + Path(__file__).parents[2] / "hugegraph_llm" / "resources" / "prompt_examples" / "prompt_examples.json" + ) + examples = json.loads(examples_path.read_text(encoding="utf-8")) + domain_markers = { + "Official Person-Relationship Extraction": ["Sarah", "James"], + "Traffic Accident Element Extraction": ["John Smith", "NY-88888"], + "Financial Event Extraction": ["Company A", "$2 billion"], + "Medical Diagnosis Extraction": ["Li Hua", "Gankang"], + } + + for example in examples: + prompt = example["prompt"] + for marker in domain_markers[example["name"]]: + assert marker in prompt diff --git a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py index 2399ec100..b13f90422 100644 --- a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py +++ b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py @@ -22,6 +22,7 @@ from pyhugegraph.utils.exceptions import CreateError, NotFoundError from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import Commit2Graph +from hugegraph_llm.operators.llm_op.property_graph_extract import PropertyGraphExtract class TestCommit2Graph(unittest.TestCase): @@ -49,6 +50,7 @@ def setUp(self): ], "vertexlabels": [ { + "id": 1, "name": "person", "properties": ["name", "age"], "primary_keys": ["name"], @@ -56,6 +58,7 @@ def setUp(self): "id_strategy": "PRIMARY_KEY", }, { + "id": 2, "name": "movie", "properties": ["title", "year"], "primary_keys": ["title"], @@ -393,6 +396,131 @@ def test_load_into_graph_maps_llm_vertex_ids_to_created_vertex_ids(self, mock_ha {"role": "Forrest Gump"}, ) + 
@patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def test_load_into_graph_uses_explicit_customize_string_ids(self, mock_handle_graph_creation): + """Test custom string ids are passed to HugeGraph when schema requires them.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="Tom Hanks"), + MagicMock(id="Forrest Gump"), + MagicMock(id="edge_id"), + ] + schema = { + "propertykeys": [ + {"name": "name", "data_type": "TEXT", "cardinality": "SINGLE"}, + {"name": "title", "data_type": "TEXT", "cardinality": "SINGLE"}, + ], + "vertexlabels": [ + { + "id": 7, + "name": "person", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["name"], + "properties": ["name"], + "nullable_keys": [], + }, + { + "id": 8, + "name": "movie", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["title"], + "properties": ["title"], + "nullable_keys": [], + }, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + vertices = [ + {"id": "Tom Hanks", "label": "person", "properties": {"name": "Tom Hanks"}}, + {"id": "Forrest Gump", "label": "movie", "properties": {"title": "Forrest Gump"}}, + ] + edges = [ + { + "label": "acted_in", + "properties": {}, + "outV": "Tom Hanks", + "inV": "Forrest Gump", + } + ] + + self.commit2graph.load_into_graph(vertices, edges, schema) + + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addVertex, + "person", + {"name": "Tom Hanks"}, + id="Tom Hanks", + ) + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addVertex, + "movie", + {"title": "Forrest Gump"}, + id="Forrest Gump", + ) + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "Tom Hanks", + "Forrest Gump", + {}, + ) + + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def 
test_load_into_graph_accepts_normalized_extraction_without_item_type(self, mock_handle_graph_creation): + """Test normalized LLM output without type fields can be committed.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="1:Tom Hanks"), + MagicMock(id="2:Forrest Gump"), + MagicMock(id="edge_id"), + ] + llm_output = """{ + "vertices": [ + { + "id": "person:Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks", + "age": 67 + } + }, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump", + "year": 1994 + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Tom Hanks", + "outVLabel": "person", + "inV": "movie:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + items = PropertyGraphExtract(llm=MagicMock())._extract_and_filter_label(self.schema, llm_output) + vertices = [item for item in items if item["type"] == "vertex"] + edges = [item for item in items if item["type"] == "edge"] + self.assertEqual(edges[0]["outV"], "1:Tom Hanks") + self.assertEqual(edges[0]["inV"], "2:Forrest Gump") + + self.commit2graph.load_into_graph(vertices, edges, self.schema) + + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "1:Tom Hanks", + "2:Forrest Gump", + {"role": "Forrest Gump"}, + ) + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") def test_load_into_graph_with_data_type_validation_failure(self, mock_handle_graph_creation): """Test load_into_graph method with data type validation failure.""" diff --git a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py index cfa1192ce..3eb490261 100644 --- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py +++ 
b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py @@ -39,19 +39,23 @@ def setUp(self): self.schema = { "vertexlabels": [ { + "id": 1, "name": "person", "primary_keys": ["name"], "nullable_keys": ["age"], "properties": ["name", "age"], }, { + "id": 2, "name": "movie", "primary_keys": ["title"], "nullable_keys": ["year"], "properties": ["title", "year"], }, ], - "edgelabels": [{"name": "acted_in", "properties": ["role"]}], + "edgelabels": [ + {"name": "acted_in", "properties": ["role"], "source_label": "person", "target_label": "movie"} + ], } # Sample text chunks @@ -77,6 +81,13 @@ def setUp(self): }""", """{ "vertices": [ + { + "type": "vertex", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, { "type": "vertex", "label": "movie", @@ -194,11 +205,13 @@ def test_extract_and_filter_label_valid_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_markdown_json(self): """Test _extract_and_filter_label with JSON wrapped in markdown fences.""" @@ -209,11 +222,13 @@ def test_extract_and_filter_label_markdown_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], 
"person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_markdown_json_with_prose(self): """Test fenced JSON can be parsed when the LLM adds prose.""" @@ -226,11 +241,13 @@ def test_extract_and_filter_label_markdown_json_with_prose(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_flat_array_json(self): """Test _extract_and_filter_label converts flat arrays to vertices and edges.""" @@ -244,6 +261,13 @@ def test_extract_and_filter_label_flat_array_json(self): "name": "Tom Hanks" } }, + { + "type": "vertex", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + }, { "type": "edge", "label": "acted_in", @@ -268,11 +292,13 @@ def test_extract_and_filter_label_flat_array_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") self.assertEqual(result[0]["label"], "person") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def 
test_extract_and_filter_label_flat_array_filters_invalid_items(self): """Test flat arrays keep valid graph items and drop invalid ones.""" @@ -285,6 +311,13 @@ def test_extract_and_filter_label_flat_array_filters_invalid_items(self): "name": "Tom Hanks" } }, + { + "type": "vertex", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + }, { "type": "vertex", "label": "unknown_label", @@ -326,11 +359,13 @@ def test_extract_and_filter_label_flat_array_filters_invalid_items(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") self.assertEqual(result[0]["label"], "person") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_malformed_fenced_json(self): """Test malformed fenced JSON returns no graph items.""" @@ -364,6 +399,12 @@ def test_extract_and_filter_label_infers_type_from_grouped_arrays(self): "properties": { "name": "Tom Hanks" } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } } ], "edges": [ @@ -390,11 +431,389 @@ def test_extract_and_filter_label_infers_type_from_grouped_arrays(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") self.assertEqual(result[0]["label"], "person") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") + + def 
test_extract_and_filter_label_normalizes_primary_key_ids(self): + """Normalize LLM vertex ids to schema-derived primary-key ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "id": "person:Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Tom Hanks", + "outVLabel": "person", + "inV": "movie:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + + def test_extract_and_filter_label_keeps_canonical_primary_key_ids(self): + """Keep already-canonical vertex and edge ids intact.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "id": "1:Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "2:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "1:Tom Hanks", + "outVLabel": "person", + "inV": "2:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + + def test_extract_and_filter_label_normalizes_multiple_primary_key_ids(self): + """Normalize multi-primary-key vertex ids in schema primary-key order.""" + 
extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + { + "id": 3, + "name": "character", + "primary_keys": ["name", "universe"], + "nullable_keys": [], + "properties": ["name", "universe"], + } + ], + "edgelabels": [], + } + text = """{ + "vertices": [ + { + "id": "character:Tom!movie", + "label": "character", + "properties": { + "name": "Tom", + "universe": "movie" + } + } + ], + "edges": [] + }""" + + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(result[0]["id"], "3:Tom!movie") + + def test_extract_and_filter_label_resolves_source_target_edge_refs(self): + """Resolve source/target edge endpoints to canonical outV/inV ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["outVLabel"], "person") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + self.assertEqual(result[2]["inVLabel"], "movie") + + def test_extract_and_filter_label_drops_edges_with_unresolved_endpoints(self): + """Drop edges whose endpoints cannot be resolved before commit.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Missing", + "outVLabel": 
"person", + "inV": "movie:Missing", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 1) + self.assertEqual(result[0]["type"], "vertex") + + def test_extract_and_filter_label_drops_legacy_edges_with_missing_vertices(self): + """Drop legacy source/target edges unless both endpoints are emitted as vertices.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 1) + self.assertEqual(result[0]["type"], "vertex") + + def test_extract_and_filter_label_keeps_explicit_custom_ids(self): + """Keep self-consistent explicit ids when schema cannot derive primary-key ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + {"name": "person", "id_strategy": "CUSTOMIZE_STRING", "properties": ["name"], "nullable_keys": []}, + {"name": "movie", "id_strategy": "CUSTOMIZE_STRING", "properties": ["title"], "nullable_keys": []}, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + text = """{ + "vertices": [ + { + "id": "Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "Tom Hanks", + "outVLabel": "person", + "inV": "Forrest Gump", + "inVLabel": "movie", + "properties": {} + } + ] + }""" 
+ + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(len(result), 3) + self.assertEqual(result[2]["outV"], "Tom Hanks") + self.assertEqual(result[2]["inV"], "Forrest Gump") + + def test_extract_and_filter_label_keeps_explicit_custom_ids_with_label_metadata(self): + """Do not rewrite custom ids even when schema includes ids and primary keys.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + { + "id": 7, + "name": "person", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["name"], + "properties": ["name"], + "nullable_keys": [], + }, + { + "id": 8, + "name": "movie", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["title"], + "properties": ["title"], + "nullable_keys": [], + }, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + text = """{ + "vertices": [ + { + "id": "Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "Tom Hanks", + "outVLabel": "person", + "inV": "Forrest Gump", + "inVLabel": "movie", + "properties": {} + } + ] + }""" + + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(result[0]["id"], "Tom Hanks") + self.assertEqual(result[1]["id"], "Forrest Gump") + self.assertEqual(result[2]["outV"], "Tom Hanks") + self.assertEqual(result[2]["inV"], "Forrest Gump") + + def test_extract_and_filter_label_drops_edges_with_mismatched_endpoint_labels(self): + """Drop edges whose endpoint labels conflict with the edge schema.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": 
"acted_in", + "outV": "1:Tom Hanks", + "outVLabel": "movie", + "inV": "2:Forrest Gump", + "inVLabel": "person", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 2) + self.assertTrue(all(item["type"] == "vertex" for item in result)) def test_extract_and_filter_label_invalid_json(self): """Test the _extract_and_filter_label method with invalid JSON.""" @@ -516,13 +935,13 @@ def test_run(self): self.assertEqual(extractor.extract_property_graph_by_llm.call_count, 2) # Verify the results - self.assertEqual(len(result["vertices"]), 2) + self.assertEqual(len(result["vertices"]), 3) self.assertEqual(len(result["edges"]), 1) self.assertEqual(result["call_count"], 2) # Check vertex properties self.assertEqual(result["vertices"][0]["properties"]["name"], "Tom Hanks") - self.assertEqual(result["vertices"][1]["properties"]["title"], "Forrest Gump") + self.assertEqual(result["vertices"][2]["properties"]["title"], "Forrest Gump") # Check edge properties self.assertEqual(result["edges"][0]["properties"]["role"], "Forrest Gump") @@ -560,7 +979,7 @@ def test_run_with_existing_vertices_and_edges(self): result = extractor.run(context) # Verify the results - self.assertEqual(len(result["vertices"]), 3) # 1 existing + 2 new + self.assertEqual(len(result["vertices"]), 4) # 1 existing + 3 new self.assertEqual(len(result["edges"]), 2) # 1 existing + 1 new self.assertEqual(result["call_count"], 2)