37 changes: 32 additions & 5 deletions runner/orchestration/executors/executor.ts
@@ -1,16 +1,21 @@
import PQueue from 'p-queue';
import {ProgressLogger} from '../../progress/progress-logger.js';
import {
import z from 'zod';
import type {ProgressLogger} from '../../progress/progress-logger.js';
import type {
LlmContextFile,
LlmGenerateFilesRequest,
LlmResponse,
LlmResponseFile,
RootPromptDefinition,
TestExecutionResult,
} from '../../shared-interfaces.js';
import {BuildResult} from '../../workers/builder/builder-types.js';
import z from 'zod';
import {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
import type {BuildResult} from '../../workers/builder/builder-types.js';
import type {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
import type {
ExecutorAutoRateResponse,
ExecutorCodeAutoRateRequest,
ExecutorVisualAutoRateRequest,
} from '../../ratings/autoraters/auto-rate-shared.js';

export type EvalID = string & {__evalID: true};

@@ -124,6 +129,28 @@ export const executorSchema = z.object({
}),
),
),
autoRateCode: z
.function(
z.tuple([
z.custom<ExecutorCodeAutoRateRequest>().describe('Context for the automated code rating'),
z
.custom<AbortSignal>()
.describe('Abort signal that fires when the request should be canceled.'),
]),
z.promise(z.custom<ExecutorAutoRateResponse>()),
)
.optional(),
autoRateVisuals: z
.function(
z.tuple([
z.custom<ExecutorVisualAutoRateRequest>().describe('Context for the automated visual rating'),
z
.custom<AbortSignal>()
.describe('Abort signal that fires when the request should be canceled.'),
]),
z.promise(z.custom<ExecutorAutoRateResponse>()),
)
.optional(),
});

export type Executor = z.infer<typeof executorSchema>;
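
The two new optional schema entries mean an executor can supply its own rating hooks, with the raters below falling back to the built-in Genkit path when the hooks are absent. A minimal sketch of what an executor-side autoRateCode hook could look like (the placeholder scoring is hypothetical; only the request/response types come from this change):

import type {
  ExecutorAutoRateResponse,
  ExecutorCodeAutoRateRequest,
} from '../../ratings/autoraters/auto-rate-shared.js';

// Hypothetical implementation of the optional hook on a custom executor.
async function autoRateCode(
  request: ExecutorCodeAutoRateRequest,
  abortSignal: AbortSignal,
): Promise<ExecutorAutoRateResponse> {
  // A real executor would forward `request.ratingPrompt` and `request.files`
  // to its own rating backend and honor `abortSignal`; here we just return a
  // neutral placeholder score within [minRating, maxRating].
  const rating = Math.round((request.minRating + request.maxRating) / 2);
  return {
    rating,
    summary: 'Placeholder rating produced by a custom executor backend.',
    categories: [],
  };
}

When a hook returns a usage field, it is propagated into the rater's token accounting; otherwise the counts default to zero, as the code-rater changes below show.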
2 changes: 1 addition & 1 deletion runner/orchestration/executors/local-executor.ts
@@ -1,5 +1,5 @@
import {ChildProcess, fork} from 'node:child_process';
import path, {join} from 'node:path';
import path from 'node:path';
import PQueue from 'p-queue';
import {LlmRunner, McpServerDetails} from '../../codegen/llm-runner.js';
import {getRunnerByName, RunnerName} from '../../codegen/runner-creation.js';
46 changes: 43 additions & 3 deletions runner/ratings/autoraters/auto-rate-shared.ts
@@ -1,4 +1,7 @@
import {Usage} from '../../shared-interfaces.js';
import type {LlmContextFile, Usage} from '../../shared-interfaces.js';

/** Minimum rating that the LLM can assign. */
export const MIN_RATING = 1;

/** Maximum rating that the LLM can assign. */
export const MAX_RATING = 10;
@@ -13,8 +16,45 @@ export interface AutoRateResult {
};
}

export function getCoefficient(rating: number): number {
const percent = rating / MAX_RATING;
/** Request for executor to auto-rate generated code. */
export interface ExecutorCodeAutoRateRequest {
/** Prompt used for the rating. */
ratingPrompt: string;
/** Files that should be rated. */
files: LlmContextFile[];
/** Minimum score. */
minRating: number;
/** Maximum score. */
maxRating: number;
}

/** Request for executor to auto-rate a screenshot of the generated app. */
export interface ExecutorVisualAutoRateRequest {
/** Prompt used for the rating. */
ratingPrompt: string;
/** URL to the image to be rated. */
imageUrl: string;
/** base64 representation of the image. */
base64Image: string;
/** Minimum score. */
minRating: number;
/** Maximum score. */
maxRating: number;
}

/** Response from the executor to an automated rating request. */
export interface ExecutorAutoRateResponse {
/** Score of the rating. */
rating: number;
/** Text summary of the result. */
summary: string;
/** Categories of the rating and related descriptions. */
categories: {name: string; message: string}[];
/** Usage information about the auto rate request. */
usage?: Usage;
}

export function getCoefficient(rating: number, maxRating: number): number {
const percent = rating / maxRating;

// More than 80% is a perfect score.
if (percent >= 0.8) {
91 changes: 55 additions & 36 deletions runner/ratings/autoraters/code-rater.ts
@@ -2,13 +2,14 @@ import {readFileSync} from 'node:fs';
import {z} from 'zod';
import {prepareContextFilesMessage} from '../../orchestration/codegen.js';
import {Environment} from '../../configuration/environment.js';
import {IndividualAssessmentState, LlmResponseFile, Usage} from '../../shared-interfaces.js';
import {
IndividualAssessment,
IndividualAssessmentState,
LlmResponseFile,
SkippedIndividualAssessment,
} from '../../shared-interfaces.js';
import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
AutoRateResult,
ExecutorAutoRateResponse,
getCoefficient,
MAX_RATING,
MIN_RATING,
} from './auto-rate-shared.js';
import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
import defaultCodeRaterPrompt from './code-rating-prompt.js';
import {RatingsResult} from '../rating-types.js';
@@ -46,13 +47,7 @@ export async function autoRateCode(
appPrompt: string,
ratingsResult: RatingsResult,
): Promise<AutoRateResult> {
const contextMessage = prepareContextFilesMessage(
files.map(o => ({
relativePath: o.filePath,
content: o.code,
})),
);

const contextFiles = files.map(o => ({relativePath: o.filePath, content: o.code}));
let promptText: string;

if (environment.codeRatingPromptPath) {
@@ -80,32 +75,56 @@
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
}).result;

const result = await llm.generateConstrained({
abortSignal,
messages: contextMessage ? [contextMessage] : [],
model,
prompt,
skipMcp: true,
schema: z.object({
rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
summary: z.string().describe('Summary of the overall code quality.'),
categories: z.array(
z.object({
name: z.string().describe('Category name'),
message: z.string().describe('Short description of the problem.'),
}),
),
}),
});
let output: ExecutorAutoRateResponse;
let usage: Usage | null;

if (environment.executor.autoRateCode) {
output = await environment.executor.autoRateCode(
{
ratingPrompt: prompt,
files: contextFiles,
minRating: MIN_RATING,
maxRating: MAX_RATING,
},
abortSignal,
);
usage = output.usage || null;
} else {
// TODO(crisbeto): move this into the local executor once
// `Executor.autoRateCode` becomes a required method.
const contextMessage = prepareContextFilesMessage(contextFiles);
const result = await llm.generateConstrained({
abortSignal,
messages: contextMessage ? [contextMessage] : [],
model,
prompt,
skipMcp: true,
schema: z.object({
rating: z
.number()
.describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
summary: z.string().describe('Summary of the overall code quality.'),
categories: z.array(
z.object({
name: z.string().describe('Category name'),
message: z.string().describe('Short description of the problem.'),
}),
),
}),
});

output = result.output!;
usage = result.usage || null;
}

return {
coefficient: getCoefficient(result.output!.rating),
coefficient: getCoefficient(output.rating, MAX_RATING),
usage: {
inputTokens: result.usage?.inputTokens ?? 0,
outputTokens: result.usage?.outputTokens ?? 0,
totalTokens: result.usage?.totalTokens ?? 0,
thinkingTokens: result.usage?.thinkingTokens ?? 0,
inputTokens: usage?.inputTokens ?? 0,
outputTokens: usage?.outputTokens ?? 0,
totalTokens: usage?.totalTokens ?? 0,
thinkingTokens: usage?.thinkingTokens ?? 0,
},
details: result.output!,
details: output,
};
}
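
Both raters now pass the scale into getCoefficient explicitly rather than relying on the hard-coded maximum. A quick sketch of the updated call, assuming the thresholding comment shown in auto-rate-shared.ts (ratings at or above 80% of the scale are treated as a perfect score):

import {getCoefficient, MAX_RATING} from './auto-rate-shared.js';

// 8 out of 10 sits exactly at the 80% threshold noted in auto-rate-shared.ts,
// so, per that comment, it is treated as a perfect score.
const coefficient = getCoefficient(8, MAX_RATING);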
110 changes: 67 additions & 43 deletions runner/ratings/autoraters/visuals-rater.ts
@@ -1,10 +1,17 @@
import {z} from 'zod';
import {PromptDataMessage} from '../../codegen/llm-runner.js';
import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
import {
AutoRateResult,
ExecutorAutoRateResponse,
getCoefficient,
MAX_RATING,
MIN_RATING,
} from './auto-rate-shared.js';
import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
import defaultVisualRaterPrompt from './visual-rating-prompt.js';
import {Environment} from '../../configuration/environment.js';
import {screenshotUrlToPngBuffer} from '../../utils/screenshots.js';
import {Usage} from '../../shared-interfaces.js';

/**
* Automatically rate the appearance of a screenshot using an LLM.
@@ -29,53 +36,70 @@ export async function autoRateAppearance(
APP_PROMPT: appPrompt,
}).result;

const messages: PromptDataMessage[] = [
{
role: 'user',
content: [
{
media: {
base64PngImage: (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64'),
url: screenshotPngUrl,
},
},
],
},
];
const base64Image = (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64');

const result = await llm.generateConstrained({
abortSignal,
messages,
prompt,
model,
skipMcp: true,
timeout: {
description: `Rating screenshot of ${label} using ${model}`,
durationInMins: 2.5,
},
schema: z.object({
rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
summary: z
.string()
.describe('Summary of the overall app, talking about concrete features, super concise.'),
categories: z.array(
z.object({
name: z.string().describe('Category name'),
message: z.string().describe('Short description of what is missing.'),
}),
),
}),
});
let output: ExecutorAutoRateResponse;
let usage: Usage | null;

if (environment.executor.autoRateVisuals) {
output = await environment.executor.autoRateVisuals(
{
ratingPrompt: prompt,
imageUrl: screenshotPngUrl,
base64Image,
minRating: MIN_RATING,
maxRating: MAX_RATING,
},
abortSignal,
);
usage = output.usage || null;
} else {
// TODO(crisbeto): move this into the local executor once
// `Executor.autoRateVisuals` becomes a required method.
const messages: PromptDataMessage[] = [
{
role: 'user',
content: [{media: {base64PngImage: base64Image, url: screenshotPngUrl}}],
},
];

const result = await llm.generateConstrained({
abortSignal,
messages,
prompt,
model,
skipMcp: true,
timeout: {
description: `Rating screenshot of ${label} using ${model}`,
durationInMins: 2.5,
},
schema: z.object({
rating: z
.number()
.describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
summary: z
.string()
.describe('Summary of the overall app, talking about concrete features, super concise.'),
categories: z.array(
z.object({
name: z.string().describe('Category name'),
message: z.string().describe('Short description of what is missing.'),
}),
),
}),
});

const output = result.output!;
output = result.output!;
usage = result.usage || null;
}

return {
coefficient: getCoefficient(output.rating),
coefficient: getCoefficient(output.rating, MAX_RATING),
usage: {
inputTokens: result.usage?.inputTokens ?? 0,
outputTokens: result.usage?.outputTokens ?? 0,
totalTokens: result.usage?.totalTokens ?? 0,
thinkingTokens: result.usage?.thinkingTokens ?? 0,
inputTokens: usage?.inputTokens ?? 0,
outputTokens: usage?.outputTokens ?? 0,
totalTokens: usage?.totalTokens ?? 0,
thinkingTokens: usage?.thinkingTokens ?? 0,
},
details: output,
};