Skip to content

Break up eval test suite #46

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ on:
branches: [main]
paths:
- 'src/mcp/**'
- 'src/lib/sentry*'
- 'src/evals/workflow.test.ts'
- 'src/lib/sentry-api/**'
- 'src/evals/**'
- '*.eval.ts'
- '.github/workflows/eval.yml'
pull_request:
paths:
- 'src/mcp/**'
- 'src/lib/sentry*'
- 'src/evals/workflow.test.ts'
- 'src/lib/sentry-api/**'
- 'src/evals/**'
- '*.eval.ts'
- '.github/workflows/eval.yml'

jobs:
Expand Down
18 changes: 18 additions & 0 deletions src/evals/create-project.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: creating a Sentry project through the MCP tools.
 *
 * The prompt asks for a new project whose organization, slug and team all come
 * from FIXTURES, and asks for the answer to be printed as the project slug
 * followed by the DSN, each on its own line.
 */
describeEval("create-project", {
  data: async () => {
    return [
      {
        input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n<PROJECT_SLUG>\n<SENTRY_DSN>`,
        // The slug line is taken from the same fixture the prompt interpolates,
        // so the prompt and the expectation cannot drift apart.
        expected: `${FIXTURES.projectSlug}\nhttps://d20df0a1ab5031c7f3c7edca9c02814d@o4509106732793856.ingest.us.sentry.io/4509109104082945`,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
17 changes: 17 additions & 0 deletions src/evals/create-team.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: creating a Sentry team through the MCP tools.
 *
 * The prompt asks for a new team in the fixture organization and for the
 * answer to contain only the team slug.
 */
describeEval("create-team", {
  data: async () => {
    return [
      {
        // The team name comes from FIXTURES.teamSlug (the same value `expected`
        // uses) instead of a second hard-coded copy. The rest of the prompt
        // wording is kept verbatim so the model input is unchanged.
        input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.teamSlug}' response with **only** the team slug and no other text.`,
        expected: FIXTURES.teamSlug,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
28 changes: 28 additions & 0 deletions src/evals/get-issue.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: analysing a single Sentry issue addressed by its short ID.
 *
 * The expected answer is a markdown summary of REMOTE-MCP-41 containing the
 * error message, the issue ID, a fenced stacktrace and the issue URL.
 */
describeEval("get-issue", {
  data: async () => {
    // Assembled line by line and joined with newlines into one markdown document.
    const issueSummary = [
      "## REMOTE-MCP-41",
      "- **Error**: Tool list_organizations is already registered",
      "- **Issue ID**: REMOTE-MCP-41",
      "- **Stacktrace**:",
      "```",
      "index.js at line 7809:27",
      '"index.js" at line 8029:24',
      '"index.js" at line 19631:28',
      "```",
      `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
    ].join("\n");

    const testCase = {
      input: "Analyze issue REMOTE-MCP-41 from Sentry.",
      expected: issueSummary,
    };
    return [testCase];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
29 changes: 29 additions & 0 deletions src/evals/list-issues.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: listing production issues from an open-ended question.
 *
 * The expected answer is a markdown summary of REMOTE-MCP-41 containing the
 * error message, the issue ID, a fenced stacktrace and the issue URL.
 */
describeEval("list-issues", {
  data: async () => {
    // NOTE(review): the prompt contains a duplicated "you"; it is reproduced
    // verbatim here because changing it alters the input given to the model.
    const question =
      "Can you you give me a list of common production errors messages, with their stacktrace and a url for more information?";

    // Assembled line by line and joined with newlines into one markdown document.
    const issueSummary = [
      "## REMOTE-MCP-41",
      "- **Error**: Tool list_organizations is already registered",
      "- **Issue ID**: REMOTE-MCP-41",
      "- **Stacktrace**:",
      "```",
      "index.js at line 7809:27",
      '"index.js" at line 8029:24',
      '"index.js" at line 19631:28',
      "```",
      `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
    ].join("\n");

    return [{ input: question, expected: issueSummary }];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
17 changes: 17 additions & 0 deletions src/evals/list-organizations.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: listing the organizations the caller can access.
 *
 * The answer is expected to mention the fixture organization slug.
 */
describeEval("list-organizations", {
  data: async () => [
    {
      input: "What organizations do I have access to in Sentry",
      expected: FIXTURES.organizationSlug,
    },
  ],
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
17 changes: 17 additions & 0 deletions src/evals/list-projects.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: listing the projects of the fixture organization.
 *
 * The answer is expected to mention the fixture project slug.
 */
describeEval("list-projects", {
  data: async () => {
    const question = `What projects do I have access to in Sentry for '${FIXTURES.organizationSlug}'`;
    return [{ input: question, expected: FIXTURES.projectSlug }];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
17 changes: 17 additions & 0 deletions src/evals/list-teams.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { describeEval } from "vitest-evals";
import { Factuality, FIXTURES, TaskRunner } from "./utils";

/**
 * Eval: listing the teams of the fixture organization.
 *
 * The answer is expected to mention the fixture team slug.
 */
// Named after this file's subject like the sibling evals; the previous name
// "workflow" was a leftover from the monolithic suite this file was split from.
describeEval("list-teams", {
  data: async () => {
    return [
      {
        input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
        expected: FIXTURES.teamSlug,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});
65 changes: 63 additions & 2 deletions src/evals/utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,67 @@
import { generateObject, type LanguageModel } from "ai";
import { openai } from "@ai-sdk/openai";
import {
experimental_createMCPClient,
generateObject,
streamText,
type LanguageModel,
} from "ai";
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
import { z } from "zod";

/**
 * Sentry slugs shared by the eval suites, which interpolate them into prompts
 * and use them as expected answers.
 */
export const FIXTURES = {
organizationSlug: "sentry-mcp-evals",
teamSlug: "the-goats",
projectSlug: "cloudflare-mcp",
};

/** Model used when `TaskRunner` or `Factuality` is called without an explicit model. */
const defaultModel = openai("gpt-4o");

/**
 * Builds an eval task that answers a prompt with `model`, giving it the tools
 * exposed by an MCP server reached over a stdio transport.
 *
 * Every call of the returned function creates its own transport (configured
 * with the command `npm run start:stdio --mocks`) and MCP client, streams a
 * completion with the server's tools available, and resolves with the final
 * text. The MCP client is closed afterwards whether the run succeeded or not.
 *
 * @param model - Language model that answers the prompt; defaults to `defaultModel`.
 * @returns An async function mapping an input prompt to the model's text output.
 * @throws Rethrows, after logging, any error raised while listing tools or
 *   while generating the answer.
 */
export function TaskRunner(model: LanguageModel = defaultModel) {
  return async function TaskRunner(input: string) {
    const transport = new Experimental_StdioMCPTransport({
      command: "npm",
      args: ["run", "start:stdio", "--mocks"],
      env: {
        // NOTE(review): the non-null assertion only silences the type checker;
        // the value is still undefined at runtime when the variable is unset.
        SENTRY_AUTH_TOKEN: process.env.SENTRY_AUTH_TOKEN!,
      },
    });
    const mcpClient = await experimental_createMCPClient({
      transport,
    });

    try {
      // Listed inside the try block so that a failure here still reaches the
      // `finally` below and the client is closed instead of being leaked.
      const tools = await mcpClient.tools();

      const result = streamText({
        model,
        tools,
        system:
          "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
        prompt: input,
        maxRetries: 1,
        maxSteps: 10,
        experimental_telemetry: {
          isEnabled: true,
        },
        onError: (error) => {
          console.error(error);
        },
      });

      // Drain the stream before reading `result.text`; the individual parts
      // are intentionally ignored.
      for await (const _part of result.fullStream) {
        // no-op
      }

      return await result.text;
    } catch (error) {
      console.error(error);
      throw error;
    } finally {
      await mcpClient.close();
    }
  };
}

/**
* A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
*
Expand All @@ -10,7 +71,7 @@
* scorers: [Factuality(openai("gpt-4o"))]
* ```
*/
export function Factuality(model: LanguageModel) {
export function Factuality(model: LanguageModel = defaultModel) {
return async function Factuality(opts: {
input: string;
output: string;
Expand Down
119 changes: 0 additions & 119 deletions src/evals/workflow.eval.ts

This file was deleted.

Loading