1import Fastify from "fastify";
2
/** JSON body accepted by the `POST /evaluate` route. */
interface EvalRequest {
  /** Base URL of the model server; `/v1/chat/completions` is appended to it. */
  modelEndpoint: string;

  /** Dataset location, forwarded unchanged to `loadEvalDataset`. */
  evalDatasetPath: string;

  /**
   * Example limit forwarded to `loadEvalDataset`.
   * The route handler substitutes 100 when this is omitted.
   */
  maxExamples?: number;
}
8
/** Aggregate metrics returned by the `POST /evaluate` route. */
interface EvalResult {
  /** Share of examples whose normalized output equals the normalized expected text. */
  accuracy: number;

  /** Share of examples for which `checkFormat` returned true. */
  formatCompliance: number;

  /** Mean `computeSimilarity` score across the evaluated examples. */
  avgSimilarity: number;

  /** Number of examples that were evaluated. */
  totalExamples: number;

  /** Accuracy broken down by category name. */
  perCategory: { [category: string]: number };

  /**
   * Examples that were not an exact match. The route handler returns at most
   * 20 of them and truncates each text field to 200 characters.
   */
  failedExamples: {
    prompt: string;
    expected: string;
    generated: string;
    category: string;
  }[];
}
22
const app = Fastify({ logger: true });

/**
 * POST /evaluate
 *
 * Loads an evaluation dataset, sends each example's prompt to the model
 * endpoint one at a time, and reports aggregate metrics:
 *
 * - `accuracy`: share of examples whose normalized output equals the
 *   normalized expected text;
 * - `formatCompliance`: share of examples accepted by `checkFormat`;
 * - `avgSimilarity`: mean `computeSimilarity` score;
 * - `perCategory`: accuracy per category (examples without a category are
 *   counted under "general");
 * - `failedExamples`: up to 20 non-matching examples, each text field
 *   truncated to 200 characters.
 *
 * An empty dataset yields all-zero metrics instead of NaN.
 * Any error thrown by `loadEvalDataset` or `callModel` aborts the whole run
 * and is surfaced by Fastify as an error response.
 */
app.post<{ Body: EvalRequest }>(
  "/evaluate",
  {
    // Have Fastify reject malformed bodies with a 400 before the handler runs,
    // rather than failing on destructuring or forwarding a nonsensical limit.
    schema: {
      body: {
        type: "object",
        required: ["modelEndpoint", "evalDatasetPath"],
        properties: {
          modelEndpoint: { type: "string", minLength: 1 },
          evalDatasetPath: { type: "string", minLength: 1 },
          maxExamples: { type: "integer", minimum: 1 },
        },
      },
    },
  },
  async (request): Promise<EvalResult> => {
    const { modelEndpoint, evalDatasetPath, maxExamples = 100 } = request.body;

    const maxReportedFailures = 20;
    const previewLength = 200;

    // NOTE(review): both `evalDatasetPath` and `modelEndpoint` come straight
    // from the request body. Unless this service is only reachable by trusted
    // callers, restrict the path to a known dataset directory and the endpoint
    // to an allowlist (arbitrary file read / SSRF risk).
    const evalData = await loadEvalDataset(evalDatasetPath, maxExamples);

    const total = evalData.length;
    if (total === 0) {
      // Avoid 0/0 = NaN, which would be serialized as `null` in the response.
      return {
        accuracy: 0,
        formatCompliance: 0,
        avgSimilarity: 0,
        totalExamples: 0,
        perCategory: {},
        failedExamples: [],
      };
    }

    let correct = 0;
    let formatOk = 0;
    let similaritySum = 0;
    // A Map rather than a plain object: category names come from the dataset,
    // and names such as "constructor" or "__proto__" would otherwise resolve to
    // inherited properties and corrupt the counters.
    const categoryStats = new Map<string, { correct: number; total: number }>();
    const failedExamples: EvalResult["failedExamples"] = [];

    // Examples are evaluated sequentially, one model call at a time.
    for (const item of evalData) {
      const generated = await callModel(modelEndpoint, item.prompt);
      const category = item.category || "general";

      let stats = categoryStats.get(category);
      if (!stats) {
        stats = { correct: 0, total: 0 };
        categoryStats.set(category, stats);
      }
      stats.total++;

      if (normalizeText(generated) === normalizeText(item.expected)) {
        correct++;
        stats.correct++;
      } else if (failedExamples.length < maxReportedFailures) {
        // Only the first few failures are ever returned, so stop collecting
        // once the cap is reached instead of buffering every failure.
        failedExamples.push({
          prompt: item.prompt.slice(0, previewLength),
          expected: item.expected.slice(0, previewLength),
          generated: generated.slice(0, previewLength),
          category,
        });
      }

      if (checkFormat(generated, item.formatSpec)) formatOk++;
      similaritySum += computeSimilarity(generated, item.expected);
    }

    return {
      accuracy: correct / total,
      formatCompliance: formatOk / total,
      avgSimilarity: similaritySum / total,
      totalExamples: total,
      perCategory: Object.fromEntries(
        [...categoryStats].map(
          ([name, stats]): [string, number] => [name, stats.correct / stats.total]
        )
      ),
      failedExamples,
    };
  }
);
76
/**
 * Sends a single-turn chat request to `<endpoint>/v1/chat/completions` and
 * returns the text of the first choice.
 *
 * @param endpoint Base URL of the model server. Trailing slashes are ignored.
 * @param prompt   Text sent as the only user message.
 * @returns The string found at `choices[0].message.content` in the response.
 * @throws Error if the server answers with a non-2xx status, or if the
 *   response body has no string at `choices[0].message.content`. Network and
 *   JSON-parsing failures from `fetch` / `response.json()` propagate as-is.
 */
async function callModel(endpoint: string, prompt: string): Promise<string> {
  // Strip trailing slashes so "http://host/" does not become "http://host//v1/...".
  let end = endpoint.length;
  while (end > 0 && endpoint.charAt(end - 1) === "/") end--;
  const baseUrl = endpoint.slice(0, end);

  const response = await fetch(`${baseUrl}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "default",
      messages: [{ role: "user", content: prompt }],
      temperature: 0.1,
      max_tokens: 512,
    }),
  });

  // fetch only rejects on network failure; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(
      `Model endpoint returned HTTP ${response.status} ${response.statusText}`
    );
  }

  // Treat the payload as untrusted and verify the one field that is used.
  const data: unknown = await response.json();
  const content = (
    data as { choices?: Array<{ message?: { content?: unknown } }> } | null
  )?.choices?.[0]?.message?.content;

  if (typeof content !== "string") {
    throw new Error(
      "Model response is missing a string at choices[0].message.content"
    );
  }
  return content;
}
92
/**
 * Canonicalizes text so that comparisons ignore case, Unicode composition
 * form, and whitespace layout.
 *
 * @param text Raw text (model output or expected answer).
 * @returns The text lower-cased, NFC-normalized, trimmed, and with every run
 *   of whitespace collapsed to a single space.
 */
function normalizeText(text: string): string {
  return text
    .toLowerCase()
    // Fold canonically equivalent sequences (e.g. "e" + U+0301 vs. "é") so
    // visually identical answers are not scored as mismatches.
    .normalize("NFC")
    .trim()
    .replace(/\s+/g, " ");
}
96
/**
 * Word-overlap similarity between two texts, in the range [0, 1].
 *
 * Both inputs are passed through `normalizeText` and split on spaces into
 * sets of distinct words. The score is the number of shared words divided by
 * the size of the larger set, so the result is symmetric in `a` and `b` and
 * repeated words do not inflate it.
 *
 * @param a First text (e.g. the generated output).
 * @param b Second text (e.g. the expected output).
 * @returns 1 when both texts are empty after normalization, 0 when exactly
 *   one of them is, otherwise `|A ∩ B| / max(|A|, |B|)`.
 */
function computeSimilarity(a: string, b: string): number {
  // "".split(" ") yields [""], so drop empty tokens before building the sets.
  const wordsA = new Set(normalizeText(a).split(" ").filter((w) => w !== ""));
  const wordsB = new Set(normalizeText(b).split(" ").filter((w) => w !== ""));

  if (wordsA.size === 0 && wordsB.size === 0) return 1.0;

  let shared = 0;
  for (const word of wordsA) {
    if (wordsB.has(word)) shared++;
  }
  return shared / Math.max(wordsA.size, wordsB.size);
}
112
/**
 * Reports whether a model output satisfies an optional format requirement.
 *
 * Only the "json" type is actually verified: the output passes when
 * `JSON.parse` accepts it (any valid JSON value, not just objects).
 * A missing spec, or a spec with any other `type`, is treated as compliant.
 *
 * @param output     Text produced by the model.
 * @param formatSpec Optional format requirement for this example.
 * @returns `false` only when the spec type is "json" and `output` does not parse.
 */
function checkFormat(output: string, formatSpec?: { type: string }): boolean {
  const mustBeJson = formatSpec?.type === "json";
  if (!mustBeJson) {
    return true;
  }

  try {
    JSON.parse(output);
  } catch {
    return false;
  }
  return true;
}
120