From 692cdba602b5afeaaaff27424f4ae7253b222c8a Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Mon, 15 Dec 2025 15:40:05 +0200 Subject: [PATCH] feat: add ability to define additional analysis prompts Adds the `analysisPrompts` field to the environment config that allows users to define their own prompts for analyzing the eval results. Example: ``` { // Usual config fields... analysisPrompts: [{name: 'Custom analysis', path: './custom-analysis.md'}] } ``` --- .../pages/report-viewer/report-viewer.html | 12 ++++++ runner/configuration/environment-config.ts | 19 +++++++++ runner/configuration/environment.ts | 36 +++++++++++++++-- runner/orchestration/generate-summary.ts | 40 ++++++++++++++++++- runner/reporting/report-ai-chat.ts | 2 +- runner/shared-interfaces.ts | 2 + 6 files changed, 106 insertions(+), 5 deletions(-) diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index b035275..d92d18d 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -225,6 +225,18 @@

Repair System Prompt

} + @if (report.details.summary.additionalAiAnalysis !== undefined) { + @for (item of report.details.summary.additionalAiAnalysis; track item) { + + + Gemini Logo + {{item.name}} + +
+
+ } + } + @if (missingDeps().length > 0) { diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index 49d35a2..ab7ef39 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -8,6 +8,7 @@ import { LocalExecutorConfig, localExecutorConfigSchema, } from '../orchestration/executors/local-executor-config.js'; +import {RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js'; export const environmentConfigSchema = z.object({ /** Display name for the environment. */ @@ -98,6 +99,24 @@ export const environmentConfigSchema = z.object({ * It's useful to ensure that the set of ratings hasn't changed between two runs. */ expectedRatingHash: z.string().optional(), + + /** + * Prompts to use for additional analysis of the eval results. + */ + analysisPrompts: z + .array( + z.object({ + name: z.string(), + path: z.string(), + reportsFilter: z + .enum([ReportContextFilter.AllReports, ReportContextFilter.NonPerfectReports]) + .optional(), + ratingsFilter: z + .enum([RatingContextFilter.AllRatings, RatingContextFilter.NonPerfectRatings]) + .optional(), + }), + ) + .optional(), }); /** diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts index 02aa0aa..c6abbc3 100644 --- a/runner/configuration/environment.ts +++ b/runner/configuration/environment.ts @@ -7,6 +7,8 @@ import { FrameworkInfo, MultiStepPromptDefinition, PromptDefinition, + RatingContextFilter, + ReportContextFilter, RootPromptDefinition, } from '../shared-interfaces.js'; import {UserFacingError} from '../utils/errors.js'; @@ -22,6 +24,13 @@ interface CategoryConfig { maxPoints: number; } +interface AnalysisPrompt { + name: string; + prompt: string; + reportsFilter: ReportContextFilter; + ratingsFilter: RatingContextFilter; +} + /** Represents a single prompt evaluation environment. */ export class Environment { /** Path at which the environment is defined. 
*/ @@ -56,6 +65,9 @@ export class Environment { */ readonly ratingHash: string; + /** Additional analysis prompts defined by the user. */ + readonly analysisPrompts: AnalysisPrompt[]; + /** Ratings configured at the environment level. */ private readonly ratings: Rating[]; @@ -88,6 +100,7 @@ export class Environment { this.ratingCategories = this.getRatingCategories(config); this.ratings = this.resolveRatings(config); this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories); + this.analysisPrompts = this.resolveAnalysisPrompts(config); this.validateRatingHash(this.ratingHash, config); } @@ -262,7 +275,7 @@ export class Environment { isEditing: boolean, metadata: Metadata, ): Promise> { - const {result, contextFiles} = await this.renderEnvironmentPrompt(relativePath); + const {result, contextFiles} = this.renderEnvironmentPrompt(relativePath); return { name: name, @@ -360,13 +373,13 @@ export class Environment { } /** Renders a prompt from a path relative to the environment config. */ - private async renderEnvironmentPrompt(relativePath: string) { + private renderEnvironmentPrompt(relativePath: string) { const path = resolve(this.rootPath, relativePath); return this.renderPrompt(readFileSync(path, 'utf8'), path); } private async renderSystemPrompt(relativePath: string) { - const result = await this.renderEnvironmentPrompt(relativePath); + const result = this.renderEnvironmentPrompt(relativePath); // Optional hooks for post processing environment system prompts. Useful for e.g. // supporting `@` references from Gemini CLI or inside g3. @@ -446,4 +459,21 @@ export class Environment { ); } } + + private resolveAnalysisPrompts(config: EnvironmentConfig): AnalysisPrompt[] { + const result: AnalysisPrompt[] = []; + + config.analysisPrompts?.forEach(({name, path, reportsFilter, ratingsFilter}) => { + const prompt = this.renderEnvironmentPrompt(path).result; + + result.push({ + name, + prompt, + reportsFilter: reportsFilter ?? 
ReportContextFilter.NonPerfectReports, + ratingsFilter: ratingsFilter ?? RatingContextFilter.NonPerfectRatings, + }); + }); + + return result; + } } diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts index 935ba65..27ba1d0 100644 --- a/runner/orchestration/generate-summary.ts +++ b/runner/orchestration/generate-summary.ts @@ -1,6 +1,7 @@ import {GenkitRunner} from '../codegen/genkit/genkit-runner.js'; import {Environment} from '../configuration/environment.js'; import {redX} from '../reporting/format.js'; +import {chatWithReportAI} from '../reporting/report-ai-chat.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js'; @@ -43,7 +44,7 @@ export async function prepareSummary( let aiSummary: string | undefined = undefined; if (generateAiSummaryLlm) { - console.log(`✨ Generating AI summary for evaluation run..`); + console.log(`✨ Generating AI summary for evaluation run...`); try { const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments); inputTokens += result.usage.inputTokens; @@ -61,6 +62,42 @@ export async function prepareSummary( } } + const additionalAiAnalysis: {name: string; summary: string}[] = []; + if (generateAiSummaryLlm && env.analysisPrompts.length > 0) { + console.log(`✨ Generating additional AI analysis...`); + + await Promise.all( + env.analysisPrompts.map(async config => { + try { + const result = await chatWithReportAI( + generateAiSummaryLlm, + config.prompt, + abortSignal, + assessments, + [], + model, + { + reportContextFilter: config.reportsFilter, + ratingContextFilter: config.ratingsFilter, + }, + undefined, + ); + inputTokens += result.usage.inputTokens; + outputTokens += result.usage.outputTokens; + thinkingTokens += result.usage.thinkingTokens; + totalTokens += result.usage.totalTokens; + additionalAiAnalysis.push({name: config.name, summary: 
result.responseHtml}); + } catch (e) { + console.log(`${redX()} Failed custom analysis called "${config.name}".`); + + if (process.env.DEBUG === '1' && (e as Partial).stack) { + console.error((e as Error).stack); + } + } + }), + ); + } + const executorInfo = await env.executor.getExecutorInfo?.(); return { @@ -78,6 +115,7 @@ export async function prepareSummary( }, }, aiSummary, + additionalAiAnalysis, completionStats: completionStats, usage: { inputTokens, diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts index 53c1ec6..8c10218 100644 --- a/runner/reporting/report-ai-chat.ts +++ b/runner/reporting/report-ai-chat.ts @@ -87,7 +87,7 @@ ${serializeReportForPrompt(assessmentsToProcess, contextFilters)} includeThoughts: false, }, timeout: { - description: `Generating summary for report`, + description: `Chatting with AI`, durationInMins: 3, }, abortSignal, diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index c83dda1..ae510f3 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -437,6 +437,8 @@ export interface RunSummary { completionStats?: CompletionStats; /** AI summary (as HTML code) of all assessments in this run/report. */ aiSummary?: string; + /** Additional user-defined AI analysis. */ + additionalAiAnalysis?: {name: string; summary: string}[]; /** * Information about the runner that was used for the eval. * Optional since some older reports might not have it.