Skip to content

Commit 98d366c

Browse files
committed
Break up eval test suite
1 parent a52ceea commit 98d366c

10 files changed

+212
-125
lines changed

.github/workflows/eval.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ on:
66
branches: [main]
77
paths:
88
- 'src/mcp/**'
9-
- 'src/lib/sentry*'
10-
- 'src/evals/workflow.test.ts'
9+
- 'src/lib/sentry**'
10+
- 'src/evals/**'
11+
# `*` does not match `/`, so `**/` is needed to match eval files in any directory.
- '**/*.eval.ts'
1112
- '.github/workflows/eval.yml'
1213
pull_request:
1314
paths:
1415
- 'src/mcp/**'
15-
- 'src/lib/sentry*'
16-
- 'src/evals/workflow.test.ts'
16+
- 'src/lib/sentry**'
17+
- 'src/evals/**'
18+
# `*` does not match `/`, so `**/` is needed to match eval files in any directory.
- '**/*.eval.ts'
1719
- '.github/workflows/eval.yml'
1820

1921
jobs:

src/evals/create-project.eval.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks the assistant to create a project and expects the project slug
 * followed by the DSN, each on its own line.
 */
describeEval("create-project", {
  data: async () => {
    return [
      {
        input: `Create a new project in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.projectSlug}' with the '${FIXTURES.teamSlug}' team. Output **only** the project slug and the SENTRY_DSN in the format of:\n<PROJECT_SLUG>\n<SENTRY_DSN>`,
        // The slug is taken from FIXTURES so the expectation cannot drift from the prompt.
        expected: `${FIXTURES.projectSlug}\nhttps://d20df0a1ab5031c7f3c7edca9c02814d@o4509106732793856.ingest.us.sentry.io/4509109104082945`,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/create-team.eval.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks the assistant to create a team and expects only the team slug back.
 */
describeEval("create-team", {
  data: async () => {
    return [
      {
        // The team name comes from FIXTURES so the prompt and the expectation stay in sync.
        input: `Create a new team in Sentry for '${FIXTURES.organizationSlug}' called '${FIXTURES.teamSlug}'. Respond with **only** the team slug and no other text.`,
        expected: FIXTURES.teamSlug,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/get-issue.eval.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks for an analysis of issue REMOTE-MCP-41 and compares the answer
 * against a fixed markdown summary.
 */
describeEval("get-issue", {
  data: async () => {
    // One entry per line of the expected markdown answer.
    const expectedLines = [
      "## REMOTE-MCP-41",
      "- **Error**: Tool list_organizations is already registered",
      "- **Issue ID**: REMOTE-MCP-41",
      "- **Stacktrace**:",
      "```",
      "index.js at line 7809:27",
      '"index.js" at line 8029:24',
      '"index.js" at line 19631:28',
      "```",
      `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
    ];

    const testCase = {
      input: "Analyze issue REMOTE-MCP-41 from Sentry.",
      expected: expectedLines.join("\n"),
    };
    return [testCase];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/list-issues.eval.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks for common production errors and compares the answer against a
 * fixed markdown summary of REMOTE-MCP-41.
 */
describeEval("list-issues", {
  data: async () => {
    return [
      {
        input:
          "Can you give me a list of common production error messages, with their stacktrace and a url for more information?",
        expected: [
          "## REMOTE-MCP-41",
          "- **Error**: Tool list_organizations is already registered",
          "- **Issue ID**: REMOTE-MCP-41",
          "- **Stacktrace**:",
          "```",
          "index.js at line 7809:27",
          '"index.js" at line 8029:24',
          '"index.js" at line 19631:28',
          "```",
          `- **URL**: https://${FIXTURES.organizationSlug}.sentry.io/issues/REMOTE-MCP-41`,
        ].join("\n"),
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/list-organizations.eval.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks which organizations are accessible and expects the fixture
 * organization slug in the answer.
 */
describeEval("list-organizations", {
  data: async () => {
    const question = "What organizations do I have access to in Sentry";
    return [{ input: question, expected: FIXTURES.organizationSlug }];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/list-projects.eval.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks which projects are accessible in the fixture organization and
 * expects the fixture project slug in the answer.
 */
describeEval("list-projects", {
  data: async () => {
    const { organizationSlug, projectSlug } = FIXTURES;
    const question = `What projects do I have access to in Sentry for '${organizationSlug}'`;
    return [{ input: question, expected: projectSlug }];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/list-teams.eval.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import { describeEval } from "vitest-evals";
2+
import { Factuality, FIXTURES, TaskRunner } from "./utils";
3+
4+
/**
 * Eval: asks which teams are accessible in the fixture organization and
 * expects the fixture team slug in the answer.
 */
// Registered as "list-teams" to match the file name; it was previously "workflow".
describeEval("list-teams", {
  data: async () => {
    return [
      {
        input: `What teams do I have access to in Sentry for '${FIXTURES.organizationSlug}'`,
        expected: FIXTURES.teamSlug,
      },
    ];
  },
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30000,
});

src/evals/utils.ts

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,67 @@
1-
import { generateObject, type LanguageModel } from "ai";
1+
import { openai } from "@ai-sdk/openai";
2+
import {
3+
experimental_createMCPClient,
4+
generateObject,
5+
streamText,
6+
type LanguageModel,
7+
} from "ai";
8+
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
29
import { z } from "zod";
310

11+
/**
 * Slugs shared by the eval files for building prompts and expected answers.
 *
 * Declared `as const` so the properties are readonly and keep their literal
 * types; an eval file cannot reassign a value that other evals also read.
 */
export const FIXTURES = {
  organizationSlug: "sentry-mcp-evals",
  teamSlug: "the-goats",
  projectSlug: "cloudflare-mcp",
} as const;
16+
17+
// Model used by TaskRunner and Factuality when the caller does not pass one.
const defaultModel = openai("gpt-4o");
18+
19+
/**
 * Builds the task function used by the evals.
 *
 * Each call of the returned function starts the MCP server over stdio through
 * `npm run start:stdio`, exposes the server's tools to the model, streams a
 * response for the prompt, and closes the MCP client before returning.
 *
 * @param model - Language model that answers the prompt; defaults to `defaultModel`.
 * @returns An async function that takes the prompt text and resolves to the
 *   final generated text.
 * @throws Rethrows any error raised while listing tools or generating text,
 *   after logging it.
 */
export function TaskRunner(model: LanguageModel = defaultModel) {
  return async function TaskRunner(input: string) {
    const transport = new Experimental_StdioMCPTransport({
      command: "npm",
      // NOTE(review): npm usually needs a `--` separator to forward flags to the
      // script; confirm `--mocks` reaches `start:stdio` as intended.
      args: ["run", "start:stdio", "--mocks"],
      env: {
        SENTRY_AUTH_TOKEN: process.env.SENTRY_AUTH_TOKEN!,
      },
    });
    const mcpClient = await experimental_createMCPClient({
      transport,
    });

    try {
      // Listed inside the `try` so a failure here still reaches `finally`
      // and the client is closed.
      const tools = await mcpClient.tools();

      const result = streamText({
        model,
        tools,
        system:
          "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
        prompt: input,
        maxRetries: 1,
        maxSteps: 10,
        experimental_telemetry: {
          isEnabled: true,
        },
        onError: (error) => {
          console.error(error);
        },
      });

      // Iterate the full stream to completion before reading the final text;
      // the individual parts are not used.
      for await (const _part of result.fullStream) {
        // intentionally empty
      }

      return await result.text;
    } catch (error) {
      console.error(error);
      throw error;
    } finally {
      await mcpClient.close();
    }
  };
}
64+
465
/**
566
* A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
667
*
@@ -10,7 +71,7 @@ import { z } from "zod";
1071
* scorers: [Factuality(openai("gpt-4o"))]
1172
* ```
1273
*/
13-
export function Factuality(model: LanguageModel) {
74+
export function Factuality(model: LanguageModel = defaultModel) {
1475
return async function Factuality(opts: {
1576
input: string;
1677
output: string;

src/evals/workflow.eval.ts

Lines changed: 0 additions & 119 deletions
This file was deleted.

0 commit comments

Comments
 (0)