diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index b035275..d92d18d 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -225,6 +225,18 @@
Repair System Prompt
}
+ @if (report.details.summary.additionalAiAnalysis !== undefined) {
+ @for (item of report.details.summary.additionalAiAnalysis; track item) {
+ <mat-expansion-panel>
+ <mat-expansion-panel-header>
+ <mat-panel-title>{{item.name}}</mat-panel-title>
+ </mat-expansion-panel-header>
+ <div [innerHTML]="item.summary"></div>
+ </mat-expansion-panel>
+ }
+ }
+
@if (missingDeps().length > 0) {
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index 49d35a2..ab7ef39 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -8,6 +8,7 @@ import {
LocalExecutorConfig,
localExecutorConfigSchema,
} from '../orchestration/executors/local-executor-config.js';
+import {RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js';
export const environmentConfigSchema = z.object({
/** Display name for the environment. */
@@ -98,6 +99,24 @@ export const environmentConfigSchema = z.object({
* It's useful to ensure that the set of ratings hasn't changed between two runs.
*/
expectedRatingHash: z.string().optional(),
+
+ /**
+ * Prompts to use for additional analysis of the eval results.
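+ *
+ * Example (names and paths below are illustrative):
+ *
+ * analysisPrompts: [
+ * {name: 'Failure themes', path: './prompts/failure-themes.md'},
+ * ]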
+ */
+ analysisPrompts: z
+ .array(
+ z.object({
+ name: z.string(),
+ path: z.string(),
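+ // Optional filters controlling which reports/ratings are included as
+ // context. Both default to the "non-perfect" variants when omitted
+ // (see `Environment.resolveAnalysisPrompts`).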
+ reportsFilter: z
+ .enum([ReportContextFilter.AllReports, ReportContextFilter.NonPerfectReports])
+ .optional(),
+ ratingsFilter: z
+ .enum([RatingContextFilter.AllRatings, RatingContextFilter.NonPerfectRatings])
+ .optional(),
+ }),
+ )
+ .optional(),
});
/**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index 02aa0aa..c6abbc3 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -7,6 +7,8 @@ import {
FrameworkInfo,
MultiStepPromptDefinition,
PromptDefinition,
+ RatingContextFilter,
+ ReportContextFilter,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {UserFacingError} from '../utils/errors.js';
@@ -22,6 +24,13 @@ interface CategoryConfig {
maxPoints: number;
}
+/** Analysis prompt resolved from the environment config. */
+interface AnalysisPrompt {
+ /** Display name for the analysis in the generated report. */
+ name: string;
+ /** Rendered text of the analysis prompt. */
+ prompt: string;
+ /** Which reports to include as context for the analysis. */
+ reportsFilter: ReportContextFilter;
+ /** Which ratings to include as context for the analysis. */
+ ratingsFilter: RatingContextFilter;
+}
+
/** Represents a single prompt evaluation environment. */
export class Environment {
/** Path at which the environment is defined. */
@@ -56,6 +65,9 @@ export class Environment {
*/
readonly ratingHash: string;
+ /** Additional analysis prompts defined by the user. */
+ readonly analysisPrompts: AnalysisPrompt[];
+
/** Ratings configured at the environment level. */
private readonly ratings: Rating[];
@@ -88,6 +100,7 @@ export class Environment {
this.ratingCategories = this.getRatingCategories(config);
this.ratings = this.resolveRatings(config);
this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
+ this.analysisPrompts = this.resolveAnalysisPrompts(config);
this.validateRatingHash(this.ratingHash, config);
}
@@ -262,7 +275,7 @@ export class Environment {
isEditing: boolean,
metadata: Metadata,
): Promise> {
- const {result, contextFiles} = await this.renderEnvironmentPrompt(relativePath);
+ const {result, contextFiles} = this.renderEnvironmentPrompt(relativePath);
return {
name: name,
@@ -360,13 +373,13 @@ export class Environment {
}
/** Renders a prompt from a path relative to the environment config. */
- private async renderEnvironmentPrompt(relativePath: string) {
+ private renderEnvironmentPrompt(relativePath: string) {
const path = resolve(this.rootPath, relativePath);
return this.renderPrompt(readFileSync(path, 'utf8'), path);
}
private async renderSystemPrompt(relativePath: string) {
- const result = await this.renderEnvironmentPrompt(relativePath);
+ const result = this.renderEnvironmentPrompt(relativePath);
// Optional hooks for post processing environment system prompts. Useful for e.g.
// supporting `@` references from Gemini CLI or inside g3.
@@ -446,4 +459,21 @@ export class Environment {
);
}
}
+
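+ /** Resolves the additional analysis prompts from the environment config, rendering each prompt file. */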
+ private resolveAnalysisPrompts(config: EnvironmentConfig): AnalysisPrompt[] {
+ const result: AnalysisPrompt[] = [];
+
+ config.analysisPrompts?.forEach(({name, path, reportsFilter, ratingsFilter}) => {
+ const prompt = this.renderEnvironmentPrompt(path).result;
+
+ result.push({
+ name,
+ prompt,
+ reportsFilter: reportsFilter ?? ReportContextFilter.NonPerfectReports,
+ ratingsFilter: ratingsFilter ?? RatingContextFilter.NonPerfectRatings,
+ });
+ });
+
+ return result;
+ }
}
diff --git a/runner/orchestration/generate-summary.ts b/runner/orchestration/generate-summary.ts
index 935ba65..27ba1d0 100644
--- a/runner/orchestration/generate-summary.ts
+++ b/runner/orchestration/generate-summary.ts
@@ -1,6 +1,7 @@
import {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
import {Environment} from '../configuration/environment.js';
import {redX} from '../reporting/format.js';
+import {chatWithReportAI} from '../reporting/report-ai-chat.js';
import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
import {AssessmentResult, CompletionStats, RunSummary} from '../shared-interfaces.js';
@@ -43,7 +44,7 @@ export async function prepareSummary(
let aiSummary: string | undefined = undefined;
if (generateAiSummaryLlm) {
- console.log(`✨ Generating AI summary for evaluation run..`);
+ console.log(`✨ Generating AI summary for evaluation run...`);
try {
const result = await summarizeReportWithAI(generateAiSummaryLlm, abortSignal, assessments);
inputTokens += result.usage.inputTokens;
@@ -61,6 +62,42 @@ export async function prepareSummary(
}
}
+ const additionalAiAnalysis: {name: string; summary: string}[] = [];
+ if (generateAiSummaryLlm && env.analysisPrompts.length > 0) {
+ console.log(`✨ Generating additional AI analysis...`);
+
+ // Run all analysis prompts concurrently, but collect the results in the
+ // order in which the prompts were configured so that report output is stable.
+ const analyses = await Promise.all(
+ env.analysisPrompts.map(async config => {
+ try {
+ const result = await chatWithReportAI(
+ generateAiSummaryLlm,
+ config.prompt,
+ abortSignal,
+ assessments,
+ [],
+ model,
+ {
+ reportContextFilter: config.reportsFilter,
+ ratingContextFilter: config.ratingsFilter,
+ },
+ undefined,
+ );
+ inputTokens += result.usage.inputTokens;
+ outputTokens += result.usage.outputTokens;
+ thinkingTokens += result.usage.thinkingTokens;
+ totalTokens += result.usage.totalTokens;
+ return {name: config.name, summary: result.responseHtml};
+ } catch (e) {
+ console.log(`${redX()} Failed to run custom analysis "${config.name}".`);
+
+ if (process.env.DEBUG === '1' && (e as Partial<Error>).stack) {
+ console.error((e as Error).stack);
+ }
+ return null;
+ }
+ }),
+ );
+
+ for (const analysis of analyses) {
+ if (analysis !== null) {
+ additionalAiAnalysis.push(analysis);
+ }
+ }
+ }
+
const executorInfo = await env.executor.getExecutorInfo?.();
return {
@@ -78,6 +115,7 @@ export async function prepareSummary(
},
},
aiSummary,
+ additionalAiAnalysis,
completionStats: completionStats,
usage: {
inputTokens,
diff --git a/runner/reporting/report-ai-chat.ts b/runner/reporting/report-ai-chat.ts
index 53c1ec6..8c10218 100644
--- a/runner/reporting/report-ai-chat.ts
+++ b/runner/reporting/report-ai-chat.ts
@@ -87,7 +87,7 @@ ${serializeReportForPrompt(assessmentsToProcess, contextFilters)}
includeThoughts: false,
},
timeout: {
- description: `Generating summary for report`,
+ description: `Chatting with AI`,
durationInMins: 3,
},
abortSignal,
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index c83dda1..ae510f3 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -437,6 +437,8 @@ export interface RunSummary {
completionStats?: CompletionStats;
/** AI summary (as HTML code) of all assessments in this run/report. */
aiSummary?: string;
+ /** Additional user-defined AI analysis (one HTML summary per configured prompt). */
+ additionalAiAnalysis?: {name: string; summary: string}[];
/**
* Information about the runner that was used for the eval.
* Optional since some older reports might not have it.