chore: implement multi-model accuracy tests MCP-39 by kmruiz · Pull Request #308 · mongodb-js/mongodb-mcp-server · GitHub
[go: up one dir, main page]

Skip to content

chore: implement multi-model accuracy tests MCP-39 #308

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@
"check:types": "tsc --noEmit --project tsconfig.json",
"reformat": "prettier --write .",
"generate": "./scripts/generate.sh",
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage"
"test": "npm run test:unit && npm run test:integration",
"test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/accuracy",
"test:unit": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/unit",
"test:integration": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathPattern=tests/integration"
},
"license": "Apache-2.0",
"devDependencies": {
Expand Down Expand Up @@ -57,7 +60,8 @@
"tsx": "^4.19.3",
"typescript": "^5.8.2",
"typescript-eslint": "^8.29.1",
"yaml": "^2.7.1"
"yaml": "^2.7.1",
"zod-to-json-schema": "^3.24.5"
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.11.2",
Expand Down
11 changes: 11 additions & 0 deletions tests/accuracy/1-step/simple-delete-query.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { describeAccuracyTest } from "../test-sdk.js";

// Accuracy scenario: a single natural-language delete request should map to one `delete-many` call.
describeAccuracyTest("1 step delete queries", ({ prompt }) => {
    // Request text handed to the model under test.
    const request = "delete all disabled users (disabled = true) in database 'my' and collection 'users'";

    prompt(request, (expectTool) => {
        // Arguments the `delete-many` tool call is verified against.
        const expectedArgs = {
            database: "my",
            collection: "users",
            filter: { disabled: true },
        };

        expectTool("delete-many").verifyCalled(expectedArgs);
    });
});
20 changes: 20 additions & 0 deletions tests/accuracy/1-step/simple-find-query.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { describeAccuracyTest } from "../test-sdk.js";

// Accuracy scenarios: each natural-language find request should map to one `find` call.
describeAccuracyTest("1 step find queries", ({ prompt }) => {
    // Each entry pairs the request text with the arguments the `find` call is verified against.
    // Entries are registered in the order listed.
    const scenarios = [
        {
            request: "find all users in database 'my' and collection 'users'",
            expectedArgs: { database: "my", collection: "users", limit: 10 },
        },
        {
            request: "find all red cars in database 'production' and collection 'cars'",
            expectedArgs: { filter: { color: "red" }, database: "production", collection: "cars", limit: 10 },
        },
        {
            request: "get 100 books in database 'prod' and collection 'books' where the author is J.R.R Tolkien",
            expectedArgs: {
                filter: { author: "J.R.R Tolkien" },
                database: "prod",
                collection: "books",
                limit: 100,
            },
        },
    ];

    for (const { request, expectedArgs } of scenarios) {
        prompt(request, (expectTool) => {
            expectTool("find").verifyCalled(expectedArgs);
        });
    }
});
12 changes: 12 additions & 0 deletions tests/accuracy/1-step/simple-update-query.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { describeAccuracyTest } from "../test-sdk.js";

// Accuracy scenario: a single natural-language update request should map to one `update-many` call.
describeAccuracyTest("1 step update queries", ({ prompt }) => {
    // Request text handed to the model under test.
    const request = "set all users with an empty email to disabled in database 'my' and collection 'users'";

    prompt(request, (expectTool) => {
        // Arguments the `update-many` tool call is verified against.
        const expectedArgs = {
            database: "my",
            collection: "users",
            filter: { email: "" },
            update: { $set: { disabled: true } },
        };

        expectTool("update-many").verifyCalled(expectedArgs);
    });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { describeAccuracyTest } from "../test-sdk.js";

// Accuracy scenario: one request that needs two tool calls (`create-collection`, then `insert-many`).
describeAccuracyTest("2 step create collection", ({ prompt }) => {
    // Multi-line request text; the template literal is kept verbatim, including its leading and trailing newline.
    const request = `
create a new collection named 'users' in database 'my' and afterwards create a sample document with the following data:
- username: "john_doe"
- email: test@mongodb.com
- password: "password123"
- disabled: false
`;

    prompt(request, (expectTool) => {
        // First verification: the collection is created.
        const createArgs = {
            database: "my",
            collection: "users",
        };
        expectTool("create-collection").verifyCalled(createArgs);

        // Second verification: the sample document is inserted into that collection.
        const sampleDocument = {
            username: "john_doe",
            email: "test@mongodb.com",
            password: "password123",
            disabled: false,
        };
        const insertArgs = {
            database: "my",
            collection: "users",
            documents: [sampleDocument],
        };
        expectTool("insert-many").verifyCalled(insertArgs);
    });
});
147 changes: 147 additions & 0 deletions tests/accuracy/models/gemini.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import { ModelFacade, ToolCall, ToolDefinition } from "./model.js";

type GeminiModel = "gemini-2.0-flash" | "gemini-1.5-flash";

/** Subset of the `generateContent` response body that this facade reads. Every level may be absent. */
type GeminiGenerateContentResponse = {
    candidates?: Array<{
        content?: {
            parts?: Array<{
                text?: string;
                functionCall?: {
                    name: string;
                    args?: Record<string, unknown>;
                };
            }>;
        };
    }>;
};

/**
 * `ModelFacade` implementation that talks to the Gemini `generateContent` REST endpoint.
 *
 * The API key is read from the `MONGODB_MCP_TEST_GEMINI_API_KEY` environment variable on every request.
 * Neither public method rejects: HTTP, network and parsing failures are logged with `console.error`
 * and reported through the return value.
 */
export class GeminiModelFacade implements ModelFacade {
    readonly name: GeminiModel;

    /**
     * @param modelName Gemini model identifier used in the request URL.
     */
    constructor(modelName: GeminiModel) {
        this.name = modelName;
    }

    /**
     * @returns `true` when the Gemini API key environment variable is set.
     */
    available(): boolean {
        return process.env.MONGODB_MCP_TEST_GEMINI_API_KEY !== undefined;
    }

    /**
     * Asks the model to break `prompt` down into a list of steps.
     *
     * @param prompt Task description embedded in the planning prompt.
     * @param tools Tool definitions sent along with the request.
     * @returns The steps parsed from the model's JSON answer, or `[]` on any HTTP, network or parsing failure.
     */
    async generatePlan(prompt: string, tools: ToolDefinition[]): Promise<string[]> {
        const planPrompt = `You are an expert MongoDB developer. Create a plan for the following task: \n ${prompt} \n Return the plan as a list of steps, as a JSON array. For example: [ "Step 1: ...", "Step 2: ...", "Step 3: ..." ]. Only return the JSON array, nothing else. Do not include any wrapper markdown or anything, just the plain JSON array.`;

        try {
            const response = await this.requestContent([planPrompt], tools);

            if (!response.ok) {
                const errorData = await response.text();
                console.error(`[Gemini API Error] HTTP error! status: ${response.status}, data: ${errorData}`);
                return [];
            }

            const result = (await response.json()) as GeminiGenerateContentResponse;

            // Concatenate the text of every part of every candidate; non-text parts contribute nothing.
            const responseString = (result.candidates ?? [])
                .flatMap((candidate) => (candidate.content?.parts ?? []).map((part) => part.text ?? ""))
                .join("");

            return this.parsePlan(responseString);
        } catch (error: unknown) {
            console.error("[Gemini API Fetch Error]", error);
            return [];
        }
    }

    /**
     * Sends `parts` as a single user message and reports what the model answered.
     *
     * @param parts Text fragments that make up the user message.
     * @param tools Tool definitions the model may call.
     * @returns Every function call found in the first candidate (in response order); when there is none,
     *          the candidate's text; otherwise a `text` describing the failure with an empty `toolCall` list.
     */
    async generateContent(parts: string[], tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }> {
        try {
            const response = await this.requestContent(parts, tools);

            if (!response.ok) {
                const errorData = await response.text();
                console.error(`[Gemini API Error] HTTP error! status: ${response.status}, data: ${errorData}`);
                return { toolCall: [], text: `Gemini API error: ${response.status}` };
            }

            const result = (await response.json()) as GeminiGenerateContentResponse;
            const responseParts = result.candidates?.[0]?.content?.parts ?? [];

            // A single answer may mix explanatory text with one or more function calls, so inspect
            // every part instead of only the first one.
            const toolCall: ToolCall[] = [];
            let text = "";
            for (const part of responseParts) {
                if (part.functionCall) {
                    // `args` can be missing for tools that take no arguments.
                    toolCall.push({ name: part.functionCall.name, args: part.functionCall.args ?? {} });
                } else if (part.text) {
                    text += part.text;
                }
            }

            if (toolCall.length > 0) {
                return { toolCall };
            }
            if (text) {
                return { toolCall: [], text };
            }
            return { toolCall: [], text: "Gemini response was empty or unexpected." };
        } catch (error: unknown) {
            console.error("[Gemini API Fetch Error]", error);
            return { toolCall: [], text: `Error contacting Gemini LLM.` };
        }
    }

    /** Builds the `generateContent` request for a single user message and sends it. */
    private requestContent(parts: string[], tools: ToolDefinition[]): Promise<Response> {
        const apiKey = process.env.MONGODB_MCP_TEST_GEMINI_API_KEY ?? "";
        const apiUrl = `https://generativelanguage.googleapis.com/v1beta/models/${this.name}:generateContent`;

        const functionDeclarations = tools.map((tool) => ({
            name: tool.name,
            description: tool.description,
            parameters: tool.parameters ?? {},
        }));

        const payload = {
            contents: [{ role: "user", parts: parts.map((part) => ({ text: part })) }],
            // `tools` is a list of Tool objects, each holding a flat list of declarations.
            // It is omitted entirely when there is nothing to declare.
            ...(functionDeclarations.length > 0 ? { tools: [{ function_declarations: functionDeclarations }] } : {}),
        };

        return fetch(apiUrl, {
            method: "POST",
            // The key travels in a header rather than the query string so it does not end up in logged URLs.
            headers: { "Content-Type": "application/json", "x-goog-api-key": apiKey },
            body: JSON.stringify(payload),
        });
    }

    /** Strips markdown code fences from `raw` and parses it as a JSON array of strings; `[]` when it is not one. */
    private parsePlan(raw: string): string[] {
        // Models frequently wrap the array in ```json fences despite being told not to; remove all of them.
        const cleaned = raw.replace(/```(?:json)?/gi, "").trim();

        try {
            const parsed: unknown = JSON.parse(cleaned);
            if (Array.isArray(parsed) && parsed.every((step) => typeof step === "string")) {
                return parsed as string[];
            }
            console.error("[Gemini API JSON.parse Error]", cleaned, "expected a JSON array of strings");
        } catch (parseError: unknown) {
            console.error("[Gemini API JSON.parse Error]", cleaned, parseError);
        }
        return [];
    }
}
11 changes: 11 additions & 0 deletions tests/accuracy/models/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { ModelFacade } from "./model.js";
import { GeminiModelFacade } from "./gemini.js";

// Every model the accuracy tests know about, in the order they are tried.
const ALL_MODELS: ModelFacade[] = (["gemini-2.0-flash", "gemini-1.5-flash"] as const).map(
    (modelName) => new GeminiModelFacade(modelName)
);

/**
 * Lists the known models that report themselves as available.
 *
 * @returns A new array holding, in declaration order, each model whose `available()` returned a truthy value.
 */
export function availableModels(): ModelFacade[] {
    const usable: ModelFacade[] = [];
    for (const candidate of ALL_MODELS) {
        if (candidate.available()) {
            usable.push(candidate);
        }
    }
    return usable;
}
14 changes: 14 additions & 0 deletions tests/accuracy/models/model.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/** A tool invocation requested by a model: the tool's name and the arguments it was called with. */
export type ToolCall = { name: string; args: Record<string, unknown> };
/** Description of a tool that is handed to a model so it can decide to call it. */
export type ToolDefinition = {
/** Name the model uses to refer to the tool. */
name: string;
/** Human-readable explanation of what the tool does. */
description: string;
/** Parameter description for the tool. NOTE(review): presumably a JSON-schema object — confirm against the caller. */
parameters: Record<string, unknown>;
};

/** Common contract implemented by each language model used in the accuracy tests. */
export interface ModelFacade {
/** Identifier of the model. */
name: string;
/** Reports whether the model can be used right now. */
available(): boolean;

/** Asks the model for a plan for `prompt`, given the available `tools`; resolves to the plan as a list of strings. */
generatePlan(prompt: string, tools: ToolDefinition[]): Promise<string[]>;
/** Sends the text `parts` to the model together with `tools`; resolves to the requested tool calls and, optionally, a text answer. */
generateContent(parts: string[], tools: ToolDefinition[]): Promise<{ toolCall: ToolCall[]; text?: string }>;
}
Loading
Loading
0