Commit 0c409ad

feat: add evaluation helpers to easily pull rag spans (#10341)
Co-authored-by: Dustin Ngo <[email protected]>
1 parent a7e03e3 · commit 0c409ad

File tree

8 files changed: +1095 -1233 lines changed


js/examples/notebooks/tracing_openai_sessions_tutorial.ipynb

Lines changed: 0 additions & 5 deletions
@@ -133,11 +133,6 @@
   }
  ],
 "metadata": {
- "kernelspec": {
-  "display_name": "Deno",
-  "language": "typescript",
-  "name": "deno"
- },
 "language_info": {
  "name": "typescript"
 }

packages/phoenix-client/README.md

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ Phoenix Client provides an interface for interacting with the Phoenix platform vi
 - **Experiments** - Run evaluations and track experiment results
 - **Spans** - Query and analyze traces with powerful filtering
 - **Annotations** - Add human feedback and automated evaluations
+- **Evaluation Helpers** - Extract span data in formats optimized for RAG evaluation workflows
 
 ## Installation
 
packages/phoenix-client/docs/source/index.md

Lines changed: 96 additions & 0 deletions
@@ -229,6 +229,102 @@ df = pd.DataFrame({
 client.spans.log_span_annotations_dataframe(dataframe=df)
 ```
 
+### Evaluation Helpers
+
+The Phoenix Client provides helper functions to extract span data in formats optimized for RAG evaluation workflows. These helpers streamline the process of preparing data for evaluation with `phoenix.evals`.
+
+#### RAG Retrieval Evaluation
+
+Extract retrieved documents from retriever spans for relevance evaluation:
+
+```python
+from phoenix.client import Client
+from phoenix.client.helpers.spans import get_retrieved_documents
+
+client = Client()
+
+# Extract retrieved documents for evaluation
+retrieved_docs_df = get_retrieved_documents(
+    client,
+    project_name="my-rag-app"
+)
+
+# Each row is a retrieved document with its metadata
+print(retrieved_docs_df.head())
+# Index: context.span_id, document_position
+# Columns: context.trace_id, input, document, document_score, document_metadata
+
+# Use with phoenix.evals for relevance evaluation
+from phoenix.evals import LLM, async_evaluate_dataframe
+from phoenix.evals.metrics import DocumentRelevanceEvaluator
+
+llm = LLM(model="gpt-4o", provider="openai")
+relevance_evaluator = DocumentRelevanceEvaluator(llm=llm)
+
+relevance_results = await async_evaluate_dataframe(
+    dataframe=retrieved_docs_df,
+    evaluators=[relevance_evaluator],
+    concurrency=10,
+    exit_on_error=True,
+)
+relevance_results.head()
+```
+
+#### RAG Q&A Evaluation
+
+Extract Q&A pairs with reference context for hallucination evaluation:
+
+```python
+from phoenix.client.helpers.spans import get_input_output_context
+from phoenix.evals.metrics import HallucinationEvaluator
+
+# Extract Q&A with context documents
+qa_df = get_input_output_context(
+    client,
+    project_name="my-rag-app"
+)
+
+# Each row combines a Q&A pair with concatenated retrieval documents
+# Index: context.span_id
+# Columns: context.trace_id, input, output, context, metadata
+if qa_df is not None:
+    print(qa_df.head())
+
+# Run hallucination evaluations
+hallucination_evaluator = HallucinationEvaluator(llm=llm)
+
+hallucination_results = await async_evaluate_dataframe(
+    dataframe=qa_df,
+    evaluators=[hallucination_evaluator],
+    concurrency=10,
+    exit_on_error=True,
+)
+hallucination_results.head()
+```
+
+#### Time-Filtered RAG Spans
+
+Filter spans by time range for evaluation:
+
+```python
+from datetime import datetime, timedelta
+
+# Get documents from last 24 hours
+recent_docs = get_retrieved_documents(
+    client,
+    project_name="my-rag-app",
+    start_time=datetime.now() - timedelta(hours=24),
+    end_time=datetime.now()
+)
+
+# Get Q&A from last week
+weekly_qa = get_input_output_context(
+    client,
+    project_name="my-rag-app",
+    start_time=datetime.now() - timedelta(days=7)
+)
+```
+
 ### Datasets
 
 Manage evaluation datasets and examples for experiments and testing:
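
Editor's note: the new docs compute evaluation results but stop before writing them back to Phoenix. The sketch below is a hedged follow-up; the `log_span_annotations_dataframe` call is taken from the surrounding docs, while the `annotation_name`/`annotator_kind` columns and the assumption that the results frame keeps its `context.span_id` index are illustrative guesses, not something this commit specifies.

```python
# Hedged sketch: attach the hallucination eval results to their source spans.
# Assumes hallucination_results (from the docs above) still carries the
# context.span_id index, and that the annotation columns below match what
# log_span_annotations_dataframe expects -- both are assumptions.
annotations_df = hallucination_results.copy()
annotations_df["annotation_name"] = "hallucination"  # assumed column name
annotations_df["annotator_kind"] = "LLM"  # assumed column name

client.spans.log_span_annotations_dataframe(dataframe=annotations_df)
```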

packages/phoenix-client/src/phoenix/client/helpers/spans/__init__.py

Lines changed: 16 additions & 1 deletion
@@ -6,12 +6,27 @@
 
 from phoenix.client.__generated__ import v1
 
+from .rag import (
+    async_get_input_output_context,
+    async_get_retrieved_documents,
+    get_input_output_context,
+    get_retrieved_documents,
+)
+
 Span = v1.Span
 
 if TYPE_CHECKING:
     import pandas as pd
 
-__all__ = ["uniquify_spans", "uniquify_spans_dataframe", "dataframe_to_spans"]
+__all__ = [
+    "uniquify_spans",
+    "uniquify_spans_dataframe",
+    "dataframe_to_spans",
+    "get_input_output_context",
+    "get_retrieved_documents",
+    "async_get_input_output_context",
+    "async_get_retrieved_documents",
+]
 
 # Source implementation:opentelemetry.sdk.trace.id_generator.RandomIdGenerator
 
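Editor's note: the export list above adds async variants (`async_get_retrieved_documents`, `async_get_input_output_context`) that none of the touched docs demonstrate. A minimal usage sketch follows, assuming the async helpers mirror the sync signatures shown in index.md and take the client's async counterpart; `AsyncClient` here is an assumption, not confirmed by this diff.

```python
import asyncio

from phoenix.client import AsyncClient  # assumed async counterpart of Client
from phoenix.client.helpers.spans import (
    async_get_input_output_context,
    async_get_retrieved_documents,
)


async def main() -> None:
    client = AsyncClient()
    # Assumption: same parameters as the sync helpers documented above.
    docs_df = await async_get_retrieved_documents(client, project_name="my-rag-app")
    qa_df = await async_get_input_output_context(client, project_name="my-rag-app")
    print(docs_df.head())
    if qa_df is not None:
        print(qa_df.head())


asyncio.run(main())
```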