Break up eval test suite

dcramer · dcramer · commit 98d366cb726c · 2025-04-13T14:49:57.000-04:00
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -6,14 +6,16 @@ on:
     branches: [main]
     paths:
       - 'src/mcp/**'
-      - 'src/lib/sentry*'
-      - 'src/evals/workflow.test.ts'
+      - 'src/lib/sentry**'
+      - 'src/evals/**'
+      - '**/*.eval.ts' # '*' never matches '/', so '**/' is needed to reach nested eval files
       - '.github/workflows/eval.yml'
   pull_request:
     paths:
       - 'src/mcp/**'
-      - 'src/lib/sentry*'
-      - 'src/evals/workflow.test.ts'
+      - 'src/lib/sentry**'
+      - 'src/evals/**'
+      - '**/*.eval.ts' # '*' never matches '/', so '**/' is needed to reach nested eval files
       - '.github/workflows/eval.yml'
 
 jobs:
diff --git a/src/evals/create-project.eval.ts b/src/evals/create-project.eval.ts
@@ -0,0 +1,18 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: create a project through the MCP tools and report its slug and DSN.
+// The expected slug is read from FIXTURES so the prompt and the expectation cannot drift apart.
+describeEval("create-project", {
+  data: async () => [
+    {
+      input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n<PROJECT_SLUG>\n<SENTRY_DSN>`,
+      expected:
+        `${FIXTURES.projectSlug}\nhttps://d20df0a1ab5031c7f3c7edca9c02814d@o4509106732793856.ingest.us.sentry.io/4509109104082945`,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/create-team.eval.ts b/src/evals/create-team.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: create a team through the MCP tools and reply with nothing but its slug.
+// The team name in the prompt is read from FIXTURES so it always matches `expected`.
+describeEval("create-team", {
+  data: async () => [
+    {
+      input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.teamSlug}'. Respond with **only** the team slug and no other text.`,
+      expected: FIXTURES.teamSlug,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/get-issue.eval.ts b/src/evals/get-issue.eval.ts
@@ -0,0 +1,28 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: ask for an analysis of a single issue and score the answer against the
+// expected markdown summary (error, issue id, stacktrace and URL).
+describeEval("get-issue", {
+  data: async () => [
+    {
+      input: "Analyze issue REMOTE-MCP-41 from Sentry.",
+      expected: [
+        "## REMOTE-MCP-41",
+        "- **Error**: Tool list_organizations is already registered",
+        "- **Issue ID**: REMOTE-MCP-41",
+        "- **Stacktrace**:",
+        "```",
+        "index.js at line 7809:27",
+        '"index.js" at line 8029:24',
+        '"index.js" at line 19631:28',
+        "```",
+        `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
+      ].join("\n"),
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-issues.eval.ts b/src/evals/list-issues.eval.ts
@@ -0,0 +1,29 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: an open-ended request for common production errors; the answer is scored
+// against the expected markdown summary of issue REMOTE-MCP-41.
+describeEval("list-issues", {
+  data: async () => [
+    {
+      input:
+        "Can you give me a list of common production error messages, with their stacktrace and a url for more information?",
+      expected: [
+        "## REMOTE-MCP-41",
+        "- **Error**: Tool list_organizations is already registered",
+        "- **Issue ID**: REMOTE-MCP-41",
+        "- **Stacktrace**:",
+        "```",
+        "index.js at line 7809:27",
+        '"index.js" at line 8029:24',
+        '"index.js" at line 19631:28',
+        "```",
+        `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
+      ].join("\n"),
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-organizations.eval.ts b/src/evals/list-organizations.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: ask which organizations are accessible; the expected answer is the
+// fixture organization slug.
+describeEval("list-organizations", {
+  data: async () => [
+    {
+      input: "What organizations do I have access to in Sentry",
+      expected: FIXTURES.organizationSlug,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-projects.eval.ts b/src/evals/list-projects.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: ask which projects are accessible in the fixture organization; the
+// expected answer is the fixture project slug.
+describeEval("list-projects", {
+  data: async () => [
+    {
+      input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+      expected: FIXTURES.projectSlug,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-teams.eval.ts b/src/evals/list-teams.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+// Eval: ask which teams are accessible in the fixture organization; the expected
+// answer is the fixture team slug. The suite is named after this file ("list-teams").
+describeEval("list-teams", {
+  data: async () => [
+    {
+      input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+      expected: FIXTURES.teamSlug,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/utils.ts b/src/evals/utils.ts
@@ -1,6 +1,67 @@
-import { generateObject, type LanguageModel } from "ai";
+import { openai } from "@ai-sdk/openai";
+import {
+  experimental_createMCPClient,
+  generateObject,
+  streamText,
+  type LanguageModel,
+} from "ai";
+import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
 import { z } from "zod";
 
+export const FIXTURES = { // Slugs shared by the evals: interpolated into prompts and used as expected answers.
+  organizationSlug: "sentry-mcp-evals",
+  teamSlug: "the-goats",
+  projectSlug: "cloudflare-mcp",
+};

+const defaultModel = openai("gpt-4o"); // Default `model` argument of TaskRunner() and Factuality().
+
+/**
+ * Creates an eval task that answers a prompt with `model` and the tools of an MCP server.
+ * Each call connects over stdio to a server launched via `npm run start:stdio --mocks`,
+ * streams a response with `maxSteps: 10`, and closes the MCP client before it settles.
+ * @param model - Model that answers the prompt; `defaultModel` when omitted.
+ * @returns Async function that resolves to the final text generated for `input`.
+ */
+export function TaskRunner(model: LanguageModel = defaultModel) {
+  return async function TaskRunner(input: string) {
+    const transport = new Experimental_StdioMCPTransport({
+      command: "npm",
+      args: ["run", "start:stdio", "--mocks"],
+      env: { SENTRY_AUTH_TOKEN: process.env.SENTRY_AUTH_TOKEN! },
+    });
+    const mcpClient = await experimental_createMCPClient({ transport });

+    try {
+      // Fetched inside the try so the client is still closed when listing tools fails.
+      const tools = await mcpClient.tools();
+      const result = streamText({
+        model,
+        tools,
+        system:
+          "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+        prompt: input,
+        maxRetries: 1,
+        maxSteps: 10,
+        experimental_telemetry: { isEnabled: true },
+        onError: (error) => {
+          console.error(error);
+        },
+      });

+      for await (const _part of result.fullStream) {
+        // Drain only: the individual parts are not used, the loop just runs the stream to its end.
+      }
+      return await result.text;
+    } catch (error) {
+      console.error(error);
+      throw error;
+    } finally {
+      await mcpClient.close();
+    }
+  };
+}
+
 /**
  * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
  *
@@ -10,7 +71,7 @@ import { z } from "zod";
  * scorers: [Factuality(openai("gpt-4o"))]
  * ```
  */
-export function Factuality(model: LanguageModel) {
+export function Factuality(model: LanguageModel = defaultModel) {
   return async function Factuality(opts: {
     input: string;
     output: string;
diff --git a/src/evals/workflow.eval.ts b/src/evals/workflow.eval.ts