// Tester Service — streams multi-layer QA test execution via Claude

import Anthropic from '@anthropic-ai/sdk';
import { getSkill } from '../skills/loader';
import type {
  PipelineEvent,
  TestLayer,
  TestResult,
  TestDetail,
} from '../types';

/** Model identifier sent with every test request. */
const MODEL = 'claude-sonnet-4-5-20250514';

/** Upper bound on tokens Claude may generate for one layer's report. */
const MAX_TOKENS = 8192;

/**
 * Run multi-layer tests against generated MCP server code.
 * Streams test:running and test:result events per layer.
 *
 * Layers are processed sequentially. For each layer a `test:running` event is
 * yielded, one streamed Claude request is made, the streamed text is
 * accumulated and parsed, and a `test:result` event is yielded. A failure
 * while handling one layer is reported as a failed result for that layer and
 * does not stop the remaining layers.
 *
 * @param serverCode - Source of the MCP server under test; embedded verbatim
 *   in the prompt.
 * @param layers - Test layers to run, in order.
 * @returns An async generator of pipeline events. In addition to the two
 *   events above, an `error` event with `recoverable: true` is yielded after
 *   the failed result when the failure is an `Anthropic.RateLimitError`.
 */
export async function* runTests(
  serverCode: string,
  layers: TestLayer[]
): AsyncGenerator<PipelineEvent> {
  const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
  const systemPrompt = getSkill('tester');

  for (const layer of layers) {
    yield { type: 'test:running', layer };

    try {
      const stream = client.messages.stream({
        model: MODEL,
        max_tokens: MAX_TOKENS,
        system: systemPrompt,
        messages: [
          {
            role: 'user',
            content: buildLayerPrompt(serverCode, layer),
          },
        ],
      });

      // Only text deltas contribute to the report; other stream events are
      // ignored.
      let fullText = '';
      for await (const event of stream) {
        if (
          event.type === 'content_block_delta' &&
          event.delta.type === 'text_delta'
        ) {
          fullText += event.delta.text;
        }
      }

      // Awaited so that a failure reported by the SDK when finalizing the
      // message lands in the catch block below.
      await stream.finalMessage();

      // Parse test results
      const result = extractTestResult(fullText, layer);
      yield { type: 'test:result', result };
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);

      // Yield a failed result for this layer
      yield {
        type: 'test:result',
        result: failedResult(
          layer,
          `${layer} layer execution`,
          `Test execution failed: ${msg}`
        ),
      };

      // Also yield an error event if it's a rate limit / recoverable issue
      if (error instanceof Anthropic.RateLimitError) {
        yield {
          type: 'error',
          message: `Rate limited during ${layer} tests: ${msg}`,
          recoverable: true,
        };
      }
    }
  }
}

/**
 * Build the user prompt asking Claude to run one test layer.
 *
 * The prompt is Markdown: the layer-specific instructions, the server code in
 * a `typescript` fence, and the JSON shape that `extractTestResult` expects
 * back.
 *
 * NOTE(review): the numeric placeholders (`<number>`, `<milliseconds>`) were
 * reconstructed; the previous text had empty values after `"total":`,
 * `"failures":` and `"duration":`. Confirm the wording against the intended
 * prompt.
 */
function buildLayerPrompt(serverCode: string, layer: TestLayer): string {
  return `Run "${layer}" layer tests on this MCP server code.

## Test Layer: ${layer}
${getLayerInstructions(layer)}

## Server Code
\`\`\`typescript
${serverCode}
\`\`\`

## Output Format
Return your test results as a JSON object:
\`\`\`json
{
  "layer": "${layer}",
  "passed": true/false,
  "total": <number>,
  "failures": <number>,
  "details": [
    {
      "name": "test name",
      "passed": true/false,
      "message": "...",
      "severity": "error|warning|info"
    }
  ],
  "duration": <milliseconds>
}
\`\`\`

Run all ${layer} tests now and return results.`;
}

/**
 * Build a result representing a layer that failed as a whole (one synthetic
 * failing detail, zero tests counted, zero duration).
 */
function failedResult(
  layer: TestLayer,
  name: string,
  message: string
): TestResult {
  return {
    layer,
    passed: false,
    total: 0,
    failures: 1,
    details: [
      {
        name,
        passed: false,
        message,
        severity: 'error',
      },
    ],
    duration: 0,
  };
}

/**
 * Convert a parsed JSON value into a `TestResult` for `layer`.
 *
 * Returns `null` when the value is not a plain JSON object (null, primitive,
 * or array), so the caller can try another candidate. Missing fields fall back
 * to defaults: `passed` false, numeric fields 0, `details` empty. The `layer`
 * field is always taken from the argument, never from the parsed value.
 */
function toTestResult(parsed: unknown, layer: TestLayer): TestResult | null {
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null;
  }

  // Shape is not validated further; each field gets a default below.
  const raw = parsed as Partial<TestResult>;
  const rawDetails: unknown[] = Array.isArray(raw.details) ? raw.details : [];

  return {
    layer,
    passed: raw.passed ?? false,
    total: raw.total ?? 0,
    failures: raw.failures ?? 0,
    details: rawDetails.map((entry): TestDetail => {
      // A null or primitive entry is kept as an unnamed, non-passing detail
      // instead of throwing.
      const d: Partial<TestDetail> =
        typeof entry === 'object' && entry !== null
          ? (entry as Partial<TestDetail>)
          : {};
      return {
        name: d.name || 'unnamed',
        passed: d.passed ?? false,
        message: d.message,
        severity: d.severity || 'info',
      };
    }),
    duration: raw.duration ?? 0,
  };
}

/**
 * Extract a `TestResult` from Claude's free-form response text.
 *
 * Candidates are tried in order: the first fenced `json` code block, then the
 * span from the first `{` to the last `}`. The first candidate that parses to
 * a JSON object wins. If none does, a failed result with a single
 * `result_parsing` detail is returned.
 */
function extractTestResult(text: string, layer: TestLayer): TestResult {
  const candidates: string[] = [];

  // Try JSON code block
  const fenced = text.match(/```json\s*\n([\s\S]*?)\n```/);
  if (fenced?.[1] !== undefined) {
    candidates.push(fenced[1]);
  }

  // Try raw JSON
  const braceStart = text.indexOf('{');
  const braceEnd = text.lastIndexOf('}');
  if (braceStart !== -1 && braceEnd > braceStart) {
    candidates.push(text.slice(braceStart, braceEnd + 1));
  }

  for (const candidate of candidates) {
    try {
      const result = toTestResult(JSON.parse(candidate), layer);
      if (result) {
        return result;
      }
    } catch {
      // Not valid JSON (or not the expected shape); try the next candidate.
    }
  }

  // Fallback: couldn't parse
  return failedResult(
    layer,
    'result_parsing',
    'Could not parse test results from Claude response'
  );
}

/**
 * Return the checklist of instructions embedded in the prompt for `layer`.
 * Layers without a dedicated checklist get a generic one-line instruction.
 */
function getLayerInstructions(layer: TestLayer): string {
  switch (layer) {
    case 'protocol':
      return `Test MCP protocol compliance:
- Verify initialize/capabilities handshake
- Check tools/list returns valid tool definitions
- Verify tool call/response format matches MCP spec
- Test error response format
- Check JSON-RPC envelope correctness`;

    case 'static':
      return `Run static analysis:
- TypeScript type safety (look for any, unknown misuse)
- Input validation completeness (all required params validated)
- Error handling coverage (try/catch around external calls)
- Import/export correctness
- Naming convention compliance`;

    case 'visual':
      return `Evaluate code quality visually:
- Code organization and file structure
- Documentation completeness (JSDoc, README)
- Consistent formatting and style
- Appropriate abstraction levels
- Clean separation of concerns`;

    case 'functional':
      return `Test functional correctness:
- Each tool handles valid input correctly
- Each tool handles invalid input gracefully
- Auth flow works for the configured auth type
- Rate limiting is respected
- Edge cases (empty arrays, null values, large inputs)`;

    case 'performance':
      return `Evaluate performance characteristics:
- No synchronous blocking operations
- Efficient data serialization
- Connection pooling / reuse patterns
- Memory leak potential (event listeners, closures)
- Response size management`;

    case 'security':
      return `Security audit:
- No hardcoded credentials
- Input sanitization (injection prevention)
- Proper auth token handling (not logged, not in URLs)
- Rate limit enforcement
- SSRF prevention for URL parameters
- Safe error messages (no internal details leaked)`;

    default:
      return `Run comprehensive tests for the "${layer}" layer.`;
  }
}