Commit a7e03e3

refactor: make ts default eval prompts to be in message format (#10385)
* refactor: make ts default eval prompts to be in message format
* document relevance
* clean types
1 parent f08730b commit a7e03e3

File tree

4 files changed: +64, -15 lines

js/packages/phoenix-evals/src/default_templates/DOCUMENT_RELEVANCY_TEMPLATE.ts

Lines changed: 9 additions & 2 deletions

@@ -1,4 +1,9 @@
-export const DOCUMENT_RELEVANCY_TEMPLATE = `
+import { PromptTemplate } from "../types";
+
+export const DOCUMENT_RELEVANCY_TEMPLATE: PromptTemplate = [
+  {
+    role: "user",
+    content: `
 You are comparing a document to a question and trying to determine if the document text
 contains information relevant to answering the question. Here is the data:

@@ -17,7 +22,9 @@ Your response must be single word, either "relevant" or "unrelated",
 and should not contain any text or characters aside from that word.
 "unrelated" means that the document text does not contain an answer to the Question.
 "relevant" means the document text contains an answer to the Question.
-`;
+`,
+  },
+];

 export const DOCUMENT_RELEVANCY_CHOICES = {
   relevant: 1,
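
For context, a sketch of the message-format shape these templates now conform to. The actual PromptTemplate type lives in the package's "../types" module and may differ; the declarations and the {question}/{document} placeholders below are illustrative assumptions only.

// Hypothetical sketch of the message-format shape; the real PromptTemplate
// type in phoenix-evals' "../types" module may differ.
type PromptMessage = {
  role: "system" | "user" | "assistant";
  content: string;
};

type PromptTemplate = PromptMessage[];

// A default template is now an array of chat messages rather than a bare
// string, so evaluators can hand it to a chat model as-is.
const EXAMPLE_TEMPLATE: PromptTemplate = [
  {
    role: "user",
    content: "Is this document relevant to the question? {question} {document}",
  },
];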

js/packages/phoenix-evals/src/default_templates/HALLUCINATION_TEMPLATE.ts

Lines changed: 9 additions & 2 deletions

@@ -1,4 +1,9 @@
-export const HALLUCINATION_TEMPLATE = `
+import { PromptTemplate } from "../types";
+
+export const HALLUCINATION_TEMPLATE: PromptTemplate = [
+  {
+    role: "user",
+    content: `
 In this task, you will be presented with a query, a reference text and an answer. The answer is
 generated to the question based on the reference text. The answer may contain false information. You
 must use the reference text to determine if the answer to the question contains false information,

@@ -23,7 +28,9 @@ your response.
 [END DATA]

 Is the answer above factual or hallucinated based on the query and reference text?
-`;
+`,
+  },
+];

 export const HALLUCINATION_CHOICES = {
   hallucinated: 1,
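
The tests further down verify that variables are interpolated into the prompt before classification is invoked. A minimal sketch of that step, assuming simple {placeholder} substitution; formatTemplate is a hypothetical helper, not the library's actual implementation.

type PromptMessage = { role: string; content: string };

// Hypothetical helper illustrating per-message interpolation; the actual
// phoenix-evals implementation may differ.
function formatTemplate(
  template: PromptMessage[],
  variables: Record<string, string>
): PromptMessage[] {
  return template.map((message) => ({
    ...message,
    // Replace {name} placeholders with the supplied value, leaving unknown
    // placeholders untouched.
    content: message.content.replace(
      /\{(\w+)\}/g,
      (match, name) => variables[name] ?? match
    ),
  }));
}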

js/packages/phoenix-evals/test/llm/createDocumentRelevancyEvaluator.test.ts

Lines changed: 20 additions & 5 deletions

@@ -48,9 +48,14 @@ describe("createDocumentRelevancyEvaluator", () => {
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
         labels: ["relevant", "unrelated"],
-        prompt: expect.stringContaining(
-          "You are comparing a document to a question"
-        ),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(
+              "You are comparing a document to a question"
+            ),
+          }),
+        ]),
       })
     );

@@ -239,12 +244,22 @@ describe("createDocumentRelevancyEvaluator", () => {
     // Verify that the prompt contains the interpolated values
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
-        prompt: expect.stringContaining(testInput),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(testInput),
+          }),
+        ]),
       })
     );
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
-        prompt: expect.stringContaining(testOutput),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(testOutput),
+          }),
+        ]),
       })
     );
   });
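
Taken together, the assertions above describe roughly the argument shape the evaluator now passes to the mocked generateClassification: a labels array plus a prompt that is an array of chat messages with the variables already interpolated. A schematic sketch of that shape; only the field names come from the test, the message text is illustrative.

// Approximate argument shape implied by the assertions above; the exact
// generateClassification signature is an assumption based on the mocked calls.
const exampleArgs = {
  labels: ["relevant", "unrelated"],
  prompt: [
    {
      role: "user",
      // Template text with the question and document already interpolated.
      content: "You are comparing a document to a question ...",
    },
  ],
};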

js/packages/phoenix-evals/test/llm/createHallucinationEvaluator.test.ts

Lines changed: 26 additions & 6 deletions

@@ -50,9 +50,14 @@ Is the answer hallucinated? Respond with "yes" or "no".
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
         labels: ["hallucinated", "factual"],
-        prompt: expect.stringContaining(
-          "In this task, you will be presented with a query"
-        ),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(
+              "In this task, you will be presented with a query"
+            ),
+          }),
+        ]),
       })
     );

@@ -257,17 +262,32 @@ Is the answer hallucinated? Respond with "yes" or "no".
     // Verify that the prompt contains the interpolated values
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
-        prompt: expect.stringContaining(testInput),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(testInput),
+          }),
+        ]),
       })
     );
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
-        prompt: expect.stringContaining(testOutput),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(testOutput),
+          }),
+        ]),
       })
     );
     expect(mockGenerateClassification).toHaveBeenCalledWith(
       expect.objectContaining({
-        prompt: expect.stringContaining(testReference),
+        prompt: expect.arrayContaining([
+          expect.objectContaining({
+            role: "user",
+            content: expect.stringContaining(testReference),
+          }),
+        ]),
       })
     );
   });
