From e63c69da2dd8aba2e1ce6533ec08148172bd075d Mon Sep 17 00:00:00 2001 From: Kristiyan Kostadinov Date: Mon, 15 Dec 2025 11:08:47 +0200 Subject: [PATCH] feat: add auto rating methods to executor Adds the ability to run the visual and code automated ratings through the `Executor`. --- runner/orchestration/executors/executor.ts | 37 +++++- .../orchestration/executors/local-executor.ts | 2 +- runner/ratings/autoraters/auto-rate-shared.ts | 46 +++++++- runner/ratings/autoraters/code-rater.ts | 91 +++++++++------ runner/ratings/autoraters/visuals-rater.ts | 110 +++++++++++------- 5 files changed, 198 insertions(+), 88 deletions(-) diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts index e6ea121..de704a6 100644 --- a/runner/orchestration/executors/executor.ts +++ b/runner/orchestration/executors/executor.ts @@ -1,6 +1,7 @@ import PQueue from 'p-queue'; -import {ProgressLogger} from '../../progress/progress-logger.js'; -import { +import z from 'zod'; +import type {ProgressLogger} from '../../progress/progress-logger.js'; +import type { LlmContextFile, LlmGenerateFilesRequest, LlmResponse, @@ -8,9 +9,13 @@ import { RootPromptDefinition, TestExecutionResult, } from '../../shared-interfaces.js'; -import {BuildResult} from '../../workers/builder/builder-types.js'; -import z from 'zod'; -import {ServeTestingResult} from '../../workers/serve-testing/worker-types.js'; +import type {BuildResult} from '../../workers/builder/builder-types.js'; +import type {ServeTestingResult} from '../../workers/serve-testing/worker-types.js'; +import type { + ExecutorAutoRateResponse, + ExecutorCodeAutoRateRequest, + ExecutorVisualAutoRateRequest, +} from '../../ratings/autoraters/auto-rate-shared.js'; export type EvalID = string & {__evalID: true}; @@ -124,6 +129,28 @@ export const executorSchema = z.object({ }), ), ), + autoRateCode: z + .function( + z.tuple([ + z.custom().describe('Context for the automated code rating'), + z + .custom() + 
.describe('Abort Signal to fire when the request should be canceled.'), + ]), + z.promise(z.custom()), + ) + .optional(), + autoRateVisuals: z + .function( + z.tuple([ + z.custom().describe('Context for the automated visual rating'), + z + .custom() + .describe('Abort Signal to fire when the request should be canceled.'), + ]), + z.promise(z.custom()), + ) + .optional(), }); export type Executor = z.infer; diff --git a/runner/orchestration/executors/local-executor.ts b/runner/orchestration/executors/local-executor.ts index d03a6b2..c681d4a 100644 --- a/runner/orchestration/executors/local-executor.ts +++ b/runner/orchestration/executors/local-executor.ts @@ -1,5 +1,5 @@ import {ChildProcess, fork} from 'node:child_process'; -import path, {join} from 'node:path'; +import path from 'node:path'; import PQueue from 'p-queue'; import {LlmRunner, McpServerDetails} from '../../codegen/llm-runner.js'; import {getRunnerByName, RunnerName} from '../../codegen/runner-creation.js'; diff --git a/runner/ratings/autoraters/auto-rate-shared.ts b/runner/ratings/autoraters/auto-rate-shared.ts index 0f3ef7f..24a3f8c 100644 --- a/runner/ratings/autoraters/auto-rate-shared.ts +++ b/runner/ratings/autoraters/auto-rate-shared.ts @@ -1,4 +1,7 @@ -import {Usage} from '../../shared-interfaces.js'; +import type {LlmContextFile, Usage} from '../../shared-interfaces.js'; + +/** Minimum rating that the LLM can assign. */ +export const MIN_RATING = 1; /** Maximum rating that the LLM can assign. */ export const MAX_RATING = 10; @@ -13,8 +16,45 @@ export interface AutoRateResult { }; } -export function getCoefficient(rating: number): number { - const percent = rating / MAX_RATING; +/** Request for executor to auto-rate generated code. */ +export interface ExecutorCodeAutoRateRequest { + /** Prompt used for the rating. */ + ratingPrompt: string; + /** Files that should be rated. */ + files: LlmContextFile[]; + /** Minimum score. */ + minRating: number; + /** Maximum score. 
 */ + maxRating: number; +} + +export interface ExecutorVisualAutoRateRequest { + /** Prompt used for the rating. */ + ratingPrompt: string; + /** URL to the image to be rated. */ + imageUrl: string; + /** base64 representation of the image. */ + base64Image: string; + /** Minimum score. */ + minRating: number; + /** Maximum score. */ + maxRating: number; +} + +/** Response from the executor to an automated rating request. */ +export interface ExecutorAutoRateResponse { + /** Score of the rating. */ + rating: number; + /** Text summary of the result. */ + summary: string; + /** Categories of the rating and related descriptions. */ + categories: {name: string; message: string}[]; + /** Usage information about the auto rate request. */ + usage?: Usage; +} + +export function getCoefficient(rating: number, maxRating: number): number { + const percent = rating / maxRating; // More than 80% is a perfect score. if (percent >= 0.8) { diff --git a/runner/ratings/autoraters/code-rater.ts b/runner/ratings/autoraters/code-rater.ts index 37dde51..d5c429c 100644 --- a/runner/ratings/autoraters/code-rater.ts +++ b/runner/ratings/autoraters/code-rater.ts @@ -2,13 +2,14 @@ import {readFileSync} from 'node:fs'; import {z} from 'zod'; import {prepareContextFilesMessage} from '../../orchestration/codegen.js'; import {Environment} from '../../configuration/environment.js'; +import {IndividualAssessmentState, LlmResponseFile, Usage} from '../../shared-interfaces.js'; import { - IndividualAssessment, - IndividualAssessmentState, - LlmResponseFile, - SkippedIndividualAssessment, -} from '../../shared-interfaces.js'; -import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js'; +import { + AutoRateResult, + ExecutorAutoRateResponse, + getCoefficient, + MAX_RATING, + MIN_RATING, +} from './auto-rate-shared.js'; import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js'; import defaultCodeRaterPrompt from './code-rating-prompt.js'; import {RatingsResult} from 
'../rating-types.js'; @@ -46,13 +47,7 @@ export async function autoRateCode( appPrompt: string, ratingsResult: RatingsResult, ): Promise { - const contextMessage = prepareContextFilesMessage( - files.map(o => ({ - relativePath: o.filePath, - content: o.code, - })), - ); - + const contextFiles = files.map(o => ({relativePath: o.filePath, content: o.code})); let promptText: string; if (environment.codeRatingPromptPath) { @@ -80,32 +75,56 @@ export async function autoRateCode( SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson, }).result; - const result = await llm.generateConstrained({ - abortSignal, - messages: contextMessage ? [contextMessage] : [], - model, - prompt, - skipMcp: true, - schema: z.object({ - rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`), - summary: z.string().describe('Summary of the overall code quality.'), - categories: z.array( - z.object({ - name: z.string().describe('Category name'), - message: z.string().describe('Short description of the problem.'), - }), - ), - }), - }); + let output: ExecutorAutoRateResponse; + let usage: Usage | null; + + if (environment.executor.autoRateCode) { + output = await environment.executor.autoRateCode( + { + ratingPrompt: prompt, + files: contextFiles, + minRating: MIN_RATING, + maxRating: MAX_RATING, + }, + abortSignal, + ); + usage = output.usage || null; + } else { + // TODO(crisbeto): move this into the local executor once + // `Executor.autoRateCode` becomes a required method. + const contextMessage = prepareContextFilesMessage(contextFiles); + const result = await llm.generateConstrained({ + abortSignal, + messages: contextMessage ? [contextMessage] : [], + model, + prompt, + skipMcp: true, + schema: z.object({ + rating: z + .number() + .describe(`Rating from ${MIN_RATING}-${MAX_RATING}. 
Best is ${MAX_RATING}.`), + summary: z.string().describe('Summary of the overall code quality.'), + categories: z.array( + z.object({ + name: z.string().describe('Category name'), + message: z.string().describe('Short description of the problem.'), + }), + ), + }), + }); + + output = result.output!; + usage = result.usage || null; + } return { - coefficient: getCoefficient(result.output!.rating), + coefficient: getCoefficient(output.rating, MAX_RATING), usage: { - inputTokens: result.usage?.inputTokens ?? 0, - outputTokens: result.usage?.outputTokens ?? 0, - totalTokens: result.usage?.totalTokens ?? 0, - thinkingTokens: result.usage?.thinkingTokens ?? 0, + inputTokens: usage?.inputTokens ?? 0, + outputTokens: usage?.outputTokens ?? 0, + totalTokens: usage?.totalTokens ?? 0, + thinkingTokens: usage?.thinkingTokens ?? 0, }, - details: result.output!, + details: output, }; } diff --git a/runner/ratings/autoraters/visuals-rater.ts b/runner/ratings/autoraters/visuals-rater.ts index 10a490d..140cd11 100644 --- a/runner/ratings/autoraters/visuals-rater.ts +++ b/runner/ratings/autoraters/visuals-rater.ts @@ -1,10 +1,17 @@ import {z} from 'zod'; import {PromptDataMessage} from '../../codegen/llm-runner.js'; -import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js'; +import { + AutoRateResult, + ExecutorAutoRateResponse, + getCoefficient, + MAX_RATING, + MIN_RATING, +} from './auto-rate-shared.js'; import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js'; import defaultVisualRaterPrompt from './visual-rating-prompt.js'; import {Environment} from '../../configuration/environment.js'; import {screenshotUrlToPngBuffer} from '../../utils/screenshots.js'; +import {Usage} from '../../shared-interfaces.js'; /** * Automatically rate the appearance of a screenshot using an LLM. 
@@ -29,53 +36,70 @@ export async function autoRateAppearance( APP_PROMPT: appPrompt, }).result; - const messages: PromptDataMessage[] = [ - { - role: 'user', - content: [ - { - media: { - base64PngImage: (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64'), - url: screenshotPngUrl, - }, - }, - ], - }, - ]; + const base64Image = (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64'); - const result = await llm.generateConstrained({ - abortSignal, - messages, - prompt, - model, - skipMcp: true, - timeout: { - description: `Rating screenshot of ${label} using ${model}`, - durationInMins: 2.5, - }, - schema: z.object({ - rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`), - summary: z - .string() - .describe('Summary of the overall app, talking about concrete features, super concise.'), - categories: z.array( - z.object({ - name: z.string().describe('Category name'), - message: z.string().describe('Short description of what is missing.'), - }), - ), - }), - }); + let output: ExecutorAutoRateResponse; + let usage: Usage | null; + + if (environment.executor.autoRateVisuals) { + output = await environment.executor.autoRateVisuals( + { + ratingPrompt: prompt, + imageUrl: screenshotPngUrl, + base64Image, + minRating: MIN_RATING, + maxRating: MAX_RATING, + }, + abortSignal, + ); + usage = output.usage || null; + } else { + // TODO(crisbeto): move this into the local executor once + // `Executor.autoRateVisuals` becomes a required method. + const messages: PromptDataMessage[] = [ + { + role: 'user', + content: [{media: {base64PngImage: base64Image, url: screenshotPngUrl}}], + }, + ]; + + const result = await llm.generateConstrained({ + abortSignal, + messages, + prompt, + model, + skipMcp: true, + timeout: { + description: `Rating screenshot of ${label} using ${model}`, + durationInMins: 2.5, + }, + schema: z.object({ + rating: z + .number() + .describe(`Rating from ${MIN_RATING}-${MAX_RATING}. 
Best is ${MAX_RATING}.`), + summary: z + .string() + .describe('Summary of the overall app, talking about concrete features, super concise.'), + categories: z.array( + z.object({ + name: z.string().describe('Category name'), + message: z.string().describe('Short description of what is missing.'), + }), + ), + }), + }); - const output = result.output!; + output = result.output!; + usage = result.usage || null; + } return { - coefficient: getCoefficient(output.rating), + coefficient: getCoefficient(output.rating, MAX_RATING), usage: { - inputTokens: result.usage?.inputTokens ?? 0, - outputTokens: result.usage?.outputTokens ?? 0, - totalTokens: result.usage?.totalTokens ?? 0, - thinkingTokens: result.usage?.thinkingTokens ?? 0, + inputTokens: usage?.inputTokens ?? 0, + outputTokens: usage?.outputTokens ?? 0, + totalTokens: usage?.totalTokens ?? 0, + thinkingTokens: usage?.thinkingTokens ?? 0, }, details: output, };