Split the monolithic `workflow` eval into one eval file per MCP tool.

* Adds create-project, create-team, get-issue, list-issues,
  list-organizations, list-projects and list-teams evals.
* Moves the shared MCP task runner and fixture slugs into
  src/evals/utils.ts and gives `Factuality` a default model.
* Widens the eval workflow's path filters to cover every eval file.

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -6,14 +6,16 @@ on:
     branches: [main]
     paths:
       - 'src/mcp/**'
-      - 'src/lib/sentry*'
-      - 'src/evals/workflow.test.ts'
+      - 'src/lib/sentry-api/**'
+      - 'src/evals/**'
+      - '**/*.eval.ts'
       - '.github/workflows/eval.yml'
   pull_request:
     paths:
       - 'src/mcp/**'
-      - 'src/lib/sentry*'
-      - 'src/evals/workflow.test.ts'
+      - 'src/lib/sentry-api/**'
+      - 'src/evals/**'
+      - '**/*.eval.ts'
       - '.github/workflows/eval.yml'
 
 jobs:
diff --git a/src/evals/create-project.eval.ts b/src/evals/create-project.eval.ts
new file mode 100644
index 0000000..85e9f5c
--- /dev/null
+++ b/src/evals/create-project.eval.ts
@@ -0,0 +1,18 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("create-project", {
+  data: async () => {
+    return [
+      {
+        input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n\n`,
+        expected:
+          "cloudflare-mcp\nhttps://d20df0a1ab5031c7f3c7edca9c02814d@o4509106732793856.ingest.us.sentry.io/4509109104082945",
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/create-team.eval.ts b/src/evals/create-team.eval.ts
new file mode 100644
index 0000000..00f21ed
--- /dev/null
+++ b/src/evals/create-team.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("create-team", {
+  data: async () => {
+    return [
+      {
+        input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`,
+        expected: FIXTURES.teamSlug,
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/get-issue.eval.ts b/src/evals/get-issue.eval.ts
new file mode 100644
index 0000000..05ef87d
--- /dev/null
+++ b/src/evals/get-issue.eval.ts
@@ -0,0 +1,28 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("get-issue", {
+  data: async () => {
+    return [
+      {
+        input: "Analyze issue REMOTE-MCP-41 from Sentry.",
+        expected: [
+          "## REMOTE-MCP-41",
+          "- **Error**: Tool list_organizations is already registered",
+          "- **Issue ID**: REMOTE-MCP-41",
+          "- **Stacktrace**:",
+          "```",
+          "index.js at line 7809:27",
+          '"index.js" at line 8029:24',
+          '"index.js" at line 19631:28',
+          "```",
+          `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
+        ].join("\n"),
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-issues.eval.ts b/src/evals/list-issues.eval.ts
new file mode 100644
index 0000000..20835f5
--- /dev/null
+++ b/src/evals/list-issues.eval.ts
@@ -0,0 +1,29 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("list-issues", {
+  data: async () => {
+    return [
+      {
+        input:
+          "Can you you give me a list of common production errors messages, with their stacktrace and a url for more information?",
+        expected: [
+          "## REMOTE-MCP-41",
+          "- **Error**: Tool list_organizations is already registered",
+          "- **Issue ID**: REMOTE-MCP-41",
+          "- **Stacktrace**:",
+          "```",
+          "index.js at line 7809:27",
+          '"index.js" at line 8029:24',
+          '"index.js" at line 19631:28',
+          "```",
+          `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
+        ].join("\n"),
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-organizations.eval.ts b/src/evals/list-organizations.eval.ts
new file mode 100644
index 0000000..8717061
--- /dev/null
+++ b/src/evals/list-organizations.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("list-organizations", {
+  data: async () => {
+    return [
+      {
+        input: `What organizations do I have access to in Sentry`,
+        expected: FIXTURES.organizationSlug,
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-projects.eval.ts b/src/evals/list-projects.eval.ts
new file mode 100644
index 0000000..c0388d7
--- /dev/null
+++ b/src/evals/list-projects.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("list-projects", {
+  data: async () => {
+    return [
+      {
+        input: `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+        expected: FIXTURES.projectSlug,
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/list-teams.eval.ts b/src/evals/list-teams.eval.ts
new file mode 100644
--- /dev/null
+++ b/src/evals/list-teams.eval.ts
@@ -0,0 +1,17 @@
+import { describeEval } from "vitest-evals";
+import { Factuality, FIXTURES, TaskRunner } from "./utils";
+
+describeEval("list-teams", {
+  data: async () => {
+    return [
+      {
+        input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
+        expected: FIXTURES.teamSlug,
+      },
+    ];
+  },
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  threshold: 0.6,
+  timeout: 30000,
+});
diff --git a/src/evals/utils.ts b/src/evals/utils.ts
--- a/src/evals/utils.ts
+++ b/src/evals/utils.ts
@@ -1,6 +1,83 @@
-import { generateObject, type LanguageModel } from "ai";
+import { openai } from "@ai-sdk/openai";
+import {
+  experimental_createMCPClient,
+  generateObject,
+  streamText,
+  type LanguageModel,
+} from "ai";
+import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
 import { z } from "zod";
 
+/**
+ * Slugs the eval prompts and expected answers are written against.
+ */
+export const FIXTURES = {
+  organizationSlug: "sentry-mcp-evals",
+  teamSlug: "the-goats",
+  projectSlug: "cloudflare-mcp",
+};
+
+// Model used by `TaskRunner` and `Factuality` when the caller passes none.
+const defaultModel = openai("gpt-4o");
+
+/**
+ * Builds an eval task that answers a prompt using the MCP server's tools.
+ *
+ * Each call connects an MCP client over a stdio transport configured to run
+ * `npm run start:stdio --mocks`, streams a completion with the client's
+ * tools attached, and closes the client before returning or rethrowing.
+ *
+ * @param model - Language model used for the completion.
+ * @returns An async function mapping a prompt to the final response text.
+ */
+export function TaskRunner(model: LanguageModel = defaultModel) {
+  return async function TaskRunner(input: string) {
+    const transport = new Experimental_StdioMCPTransport({
+      command: "npm",
+      args: ["run", "start:stdio", "--mocks"],
+      env: {
+        SENTRY_AUTH_TOKEN: process.env.SENTRY_AUTH_TOKEN!,
+      },
+    });
+    const mcpClient = await experimental_createMCPClient({
+      transport,
+    });
+
+    try {
+      // Fetch tools inside the `try` so the client is closed if this fails.
+      const tools = await mcpClient.tools();
+
+      const result = streamText({
+        model,
+        tools,
+        system:
+          "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+        prompt: input,
+        maxRetries: 1,
+        maxSteps: 10,
+        experimental_telemetry: {
+          isEnabled: true,
+        },
+        onError: (error) => {
+          console.error(error);
+        },
+      });
+
+      // Consume the whole stream before reading the final text.
+      for await (const part of result.fullStream) {
+        // console.log(part);
+      }
+
+      return await result.text;
+    } catch (error) {
+      console.error(error);
+      throw error;
+    } finally {
+      await mcpClient.close();
+    }
+  };
+}
+
 /**
  * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
  *
@@ -10,7 +87,7 @@ import { z } from "zod";
  * scorers: [Factuality(openai("gpt-4o"))]
  * ```
  */
-export function Factuality(model: LanguageModel) {
+export function Factuality(model: LanguageModel = defaultModel) {
   return async function Factuality(opts: {
     input: string;
     output: string;
diff --git a/src/evals/workflow.eval.ts b/src/evals/workflow.eval.ts
deleted file mode 100644
index 722a9da..0000000
--- a/src/evals/workflow.eval.ts
+++ /dev/null
@@ -1,119 +0,0 @@
-import { openai } from "@ai-sdk/openai";
-import { experimental_createMCPClient, streamText } from "ai";
-import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
-import { describeEval } from "vitest-evals";
-import { Factuality } from "./utils";
-
-const model = openai("gpt-4o");
-
-const CONFIG = {
-  organizationSlug: "sentry-mcp-evals",
-  teamSlug: "the-goats",
-  projectSlug: "cloudflare-mcp",
-};
-// TODO: support this in
-describeEval("workflow", {
-  data: async () => {
-    return [
-      {
-        input: `What organizations do I have access to in Sentry`,
-        expected: CONFIG.organizationSlug,
-      },
-      {
-        input: `What teams do I have access to in Sentry for '${CONFIG.organizationSlug}'`,
-        expected: CONFIG.teamSlug,
-      },
-      {
-        input: `What projects do I have access to in Sentry for '${CONFIG.organizationSlug}'`,
-        expected: CONFIG.projectSlug,
-      },
-      {
-        input: `Create a new team in Sentry for '${CONFIG.organizationSlug}' called 'the-goats' response with **only** the team slug and no other text.`,
-        expected: CONFIG.teamSlug,
-      },
-      {
-        input: `Create a new project in Sentry for '${CONFIG.organizationSlug}' called '${CONFIG.projectSlug}' with the '${CONFIG.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n\n`,
-        expected:
-          "cloudflare-mcp\nhttps://d20df0a1ab5031c7f3c7edca9c02814d@o4509106732793856.ingest.us.sentry.io/4509109104082945",
-      },
-      {
-        input:
-          "Can you you give me a list of common production errors messages, with their stacktrace and a url for more information?",
-        expected: [
-          "## REMOTE-MCP-41",
-          "- **Error**: Tool list_organizations is already registered",
-          "- **Issue ID**: REMOTE-MCP-41",
-          "- **Stacktrace**:",
-          "```",
-          "index.js at line 7809:27",
-          '"index.js" at line 8029:24',
-          '"index.js" at line 19631:28',
-          "```",
-          `- **URL**: https://${CONFIG.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
-        ].join("\n"),
-      },
-      {
-        input: "Analyze issue REMOTE-MCP-41 from Sentry.",
-        expected: [
-          "## REMOTE-MCP-41",
-          "- **Error**: Tool list_organizations is already registered",
-          "- **Issue ID**: REMOTE-MCP-41",
-          "- **Stacktrace**:",
-          "```",
-          "index.js at line 7809:27",
-          '"index.js" at line 8029:24',
-          '"index.js" at line 19631:28',
-          "```",
-          `- **URL**: https://${CONFIG.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
-        ].join("\n"),
-      },
-    ];
-  },
-  task: async (input) => {
-    const transport = new Experimental_StdioMCPTransport({
-      command: "npm",
-      args: ["run", "start:stdio", "--mocks"],
-      env: {
-        SENTRY_AUTH_TOKEN: process.env.SENTRY_AUTH_TOKEN!,
-      },
-    });
-    const mcpClient = await experimental_createMCPClient({
-      transport,
-    });
-
-    const tools = await mcpClient.tools();
-
-    try {
-      const result = streamText({
-        model,
-        tools,
-        system:
-          "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
-        prompt: input,
-        maxRetries: 1,
-        maxSteps: 10,
-        experimental_telemetry: {
-          isEnabled: true,
-        },
-        onError: (error) => {
-          console.error(error);
-        },
-      });
-
-      for await (const part of result.fullStream) {
-        // console.log(part);
-      }
-
-      return await result.text;
-    } catch (error) {
-      console.error(error);
-      throw error;
-    } finally {
-      await mcpClient.close();
-    }
-  },
-  scorers: [Factuality(model)],
-  // skipIf: () => !process.env.OPENAI_API_KEY,
-  threshold: 0.6,
-  timeout: 30000,
-});