From a472b962c0aba01343de941ad6f95cd9a203bc4e Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov <crisbeto@abv.bg>
Date: Fri, 19 Dec 2025 12:02:57 +0200
Subject: [PATCH 1/2] feat: add option to augment prompt

Adds an option to the environment config that allows users to augment the resolved prompts before they're sent out.
---
 runner/configuration/environment-config.ts | 21 +++++-
 runner/configuration/environment.ts        | 81 +++++++++++++++++-----
 runner/orchestration/generate.ts           | 18 +++--
 3 files changed, 97 insertions(+), 23 deletions(-)
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index fefd21c..01fafc1 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -8,7 +8,9 @@ import {
   LocalExecutorConfig,
   localExecutorConfigSchema,
 } from '../orchestration/executors/local-executor-config.js';
-import {RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js';
+import {PromptDefinition, RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js';
+import type {Environment} from './environment.js';
+import type {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
 
 export const environmentConfigSchema = z.object({
   /** Display name for the environment. */
@@ -118,6 +120,13 @@ export const environmentConfigSchema = z.object({
       }),
     )
     .optional(),
+
+  /**
+   * Async function that returns the replacement text for a prompt before it's evaluated.
+   */
+  augmentExecutablePrompt: z
+    .function(z.tuple([z.custom<PromptAugmentationContext>()]), z.promise(z.string()))
+    .optional(),
 });
 
 /**
@@ -127,6 +136,16 @@ export const environmentConfigSchema = z.object({
 export type EnvironmentConfig = z.infer<typeof environmentConfigSchema> &
   Partial<LocalExecutorConfig>;
 
+/** Context object handed to the user-supplied `augmentExecutablePrompt` callback. */
+export interface PromptAugmentationContext {
+  /** Prompt definition whose `prompt` text is replaced by the callback's result. */
+  promptDef: PromptDefinition;
+  /** Environment that resolved the prompt and is invoking the callback. */
+  environment: Environment;
+  /** Genkit runner, created and disposed by the environment, for use during augmentation. */
+  runner: GenkitRunner;
+}
+
 /** Asserts that the specified data is a valid environment config. */
 export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
   const validationResult = environmentConfigSchema
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index fb46334..b4fcd46 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -14,11 +14,13 @@ import {
 import {UserFacingError} from '../utils/errors.js';
 import {generateId} from '../utils/id-generation.js';
 import {lazy} from '../utils/lazy-creation.js';
-import {EnvironmentConfig} from './environment-config.js';
+import {EnvironmentConfig, PromptAugmentationContext} from './environment-config.js';
 import {EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {renderPromptTemplate} from './prompt-templating.js';
 import {getSha256Hash} from '../utils/hashing.js';
 import {DEFAULT_SUMMARY_MODEL} from './constants.js';
+import type {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
+import {getRunnerByName} from '../codegen/runner-creation.js';
 
 interface CategoryConfig {
   name: string;
@@ -73,6 +75,14 @@ export class Environment {
   /** Ratings configured at the environment level. */
   private readonly ratings: Rating[];
 
+  /** User-configured function used to augment prompts. */
+  private readonly augmentExecutablePrompt:
+    | ((context: PromptAugmentationContext) => Promise<string>)
+    | null;
+
+  /** Runner that user can use to access an LLM to augment prompts. */
+  private augmentationRunner: GenkitRunner | null = null;
+
   constructor(
     rootPath: string,
     private readonly config: EnvironmentConfig & Required<Pick<EnvironmentConfig, 'executor'>>,
@@ -103,26 +113,27 @@ export class Environment {
     this.ratings = this.resolveRatings(config);
     this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
     this.analysisPrompts = this.resolveAnalysisPrompts(config);
+    this.augmentExecutablePrompt = config.augmentExecutablePrompt || null;
     this.validateRatingHash(this.ratingHash, config);
   }
 
   /** Prompts that should be executed as a part of the evaluation. */
-  executablePrompts = lazy(async () => {
+  readonly executablePrompts = lazy(async () => {
     return this.resolveExecutablePrompts(this.config.executablePrompts);
   });
 
-  systemPromptGeneration = lazy(async () => {
+  readonly systemPromptGeneration = lazy(async () => {
     return (await this.renderSystemPrompt(this.config.generationSystemPrompt)).result;
   });
 
-  systemPromptRepair = lazy(async () => {
+  readonly systemPromptRepair = lazy(async () => {
     if (!this.config.repairSystemPrompt) {
       return 'Please fix the given errors and return the corrected code.';
     }
     return (await this.renderSystemPrompt(this.config.repairSystemPrompt)).result;
   });
 
-  systemPromptEditing = lazy(async () => {
+  readonly systemPromptEditing = lazy(async () => {
     if (!this.config.editingSystemPrompt) {
       return this.systemPromptGeneration();
     }
@@ -180,6 +191,14 @@ export class Environment {
     });
   }
 
+  async destroy(): Promise<void> {
+    await this.executor.destroy();
+
+    if (this.augmentationRunner) { // NOTE(review): skipped if executor.destroy() rejects.
+      await this.augmentationRunner.dispose();
+    }
+  }
+
   /**
    * Gets the readable display name of a framework, based on its ID.
    * @param id ID to be resolved.
@@ -209,16 +228,16 @@ export class Environment {
    * @param config Configuration for the environment.
    */
   private async resolveExecutablePrompts(
-    prompts: EnvironmentConfig['executablePrompts'],
+    definitions: EnvironmentConfig['executablePrompts'],
   ): Promise<RootPromptDefinition[]> {
-    const result: Promise<RootPromptDefinition>[] = [];
+    const promptPromises: Promise<RootPromptDefinition>[] = [];
     const envRatings = this.ratings;
 
-    for (const def of prompts) {
+    for (const def of definitions) {
       if (def instanceof MultiStepPrompt) {
-        result.push(this.getMultiStepPrompt(def, envRatings));
+        promptPromises.push(this.getMultiStepPrompt(def, envRatings));
       } else if (def instanceof EvalPromptWithMetadata) {
-        result.push(
+        promptPromises.push(
           Promise.resolve({
             name: def.name,
             kind: 'single',
@@ -243,10 +262,10 @@ export class Environment {
           name = def.name;
         }
 
-        result.push(
+        promptPromises.push(
           ...globSync(path, {cwd: this.rootPath}).map(
             async relativePath =>
-              await this.getStepPromptDefinition(
+              await this.getSinglePromptDefinition(
                 name ?? basename(relativePath, extname(relativePath)),
                 relativePath,
                 ratings,
@@ -258,11 +277,39 @@ export class Environment {
       }
     }
 
-    return Promise.all(result);
+    const prompts = await Promise.all(promptPromises);
+
+    if (this.augmentExecutablePrompt) {
+      const augmentationPromises: Promise<unknown>[] = [];
+      const updatePrompt = (promptDef: PromptDefinition) => {
+        augmentationPromises.push(
+          this.augmentExecutablePrompt!({
+            promptDef,
+            environment: this,
+            runner: this.augmentationRunner!,
+          }).then(text => (promptDef.prompt = text)),
+        );
+      };
+      this.augmentationRunner ??= await getRunnerByName('genkit');
+
+      for (const rootPrompt of prompts) {
+        if (rootPrompt.kind === 'multi-step') {
+          for (const promptDef of rootPrompt.steps) {
+            updatePrompt(promptDef);
+          }
+        } else {
+          updatePrompt(rootPrompt);
+        }
+      }
+
+      await Promise.all(augmentationPromises);
+    }
+
+    return prompts;
   }
 
   /**
-   * Creates a prompt definition for a given step.
+   * Creates a prompt definition for a single prompt.
    *
    * @param name Name of the prompt.
    * @param rootPath Root path of the project.
@@ -270,7 +317,7 @@ export class Environment {
    * @param ratings Ratings to run against the definition.
    * @param isEditing Whether this is an editing or generation step.
    */
-  private async getStepPromptDefinition<Metadata>(
+  private async getSinglePromptDefinition<Metadata>(
     name: string,
     relativePath: string,
     ratings: Rating[],
@@ -345,11 +392,11 @@ export class Environment {
       if (stepNum === 0) {
         throw new UserFacingError('Multi-step prompts start with `step-1`.');
       }
-      const step = await this.getStepPromptDefinition(
+      const step = await this.getSinglePromptDefinition(
         `${name}-step-${stepNum}`,
         join(def.directoryPath, current.name),
         ratings,
-        /*isEditing */ stepNum !== 1,
+        /* isEditing */ stepNum !== 1,
         stepMetadata,
       );
 
diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts
index 4c6e930..42ba251 100644
--- a/runner/orchestration/generate.ts
+++ b/runner/orchestration/generate.ts
@@ -48,16 +48,24 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
   const cleanup = async () => {
     // Clean-up should never interrupt a potentially passing completion.
     try {
-      await env.executor.destroy();
-      for (const cleanupFn of extraCleanupFns) {
-        await cleanupFn();
-      }
+      await env.destroy();
     } catch (e) {
-      console.error(`Failed to destroy executor: ${e}`);
+      console.error(`Failed to destroy environment: ${e}`);
       if (e instanceof Error) {
         console.error(e.stack);
       }
     }
+
+    for (const cleanupFn of extraCleanupFns) {
+      try {
+        await cleanupFn();
+      } catch (e) {
+        console.error(`Failed cleanup: ${e}`);
+        if (e instanceof Error) {
+          console.error(e.stack);
+        }
+      }
+    }
   };
 
   // Ensure cleanup logic runs when the evaluation is aborted.

From bab8e22c382021fbe20b05f035f4f05985be9f2e Mon Sep 17 00:00:00 2001
From: Kristiyan Kostadinov <crisbeto@abv.bg>
Date: Fri, 19 Dec 2025 12:39:35 +0200
Subject: [PATCH 2/2] feat: add function for augmenting generated files

Adds the ability for the user to augment a generated response before it is evaluated.
---
 runner/configuration/environment-config.ts     | 14 +++++++++++++-
 runner/configuration/environment.ts            | 12 ++++++++++++
 runner/orchestration/codegen.ts                |  1 +
 runner/orchestration/generate-initial-files.ts |  3 ++-
 runner/orchestration/repair.ts                 |  2 +-
 5 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
index 01fafc1..fd9bf2f 100644
--- a/runner/configuration/environment-config.ts
+++ b/runner/configuration/environment-config.ts
@@ -8,7 +8,12 @@ import {
   LocalExecutorConfig,
   localExecutorConfigSchema,
 } from '../orchestration/executors/local-executor-config.js';
-import {PromptDefinition, RatingContextFilter, ReportContextFilter} from '../shared-interfaces.js';
+import {
+  LlmResponseFile,
+  PromptDefinition,
+  RatingContextFilter,
+  ReportContextFilter,
+} from '../shared-interfaces.js';
 import type {Environment} from './environment.js';
 import type {GenkitRunner} from '../codegen/genkit/genkit-runner.js';
 
@@ -127,6 +132,13 @@ export const environmentConfigSchema = z.object({
   augmentExecutablePrompt: z
     .function(z.tuple([z.custom<PromptAugmentationContext>()]), z.promise(z.string()))
     .optional(),
+
+  /**
+   * Function that returns the replacement code for a generated file before it's evaluated.
+   */
+  augmentGeneratedFile: z
+    .function(z.tuple([z.custom<Readonly<LlmResponseFile>>()]), z.string())
+    .optional(),
 });
 
 /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
index b4fcd46..f7f39c1 100644
--- a/runner/configuration/environment.ts
+++ b/runner/configuration/environment.ts
@@ -5,6 +5,7 @@ import {Executor} from '../orchestration/executors/executor.js';
 import {Rating, RatingCategory} from '../ratings/rating-types.js';
 import {
   FrameworkInfo,
+  LlmResponseFile,
   MultiStepPromptDefinition,
   PromptDefinition,
   RatingContextFilter,
@@ -83,6 +84,9 @@ export class Environment {
   /** Runner that user can use to access an LLM to augment prompts. */
   private augmentationRunner: GenkitRunner | null = null;
 
+  /** User-provided callback for augmenting the LLM-generated files. */
+  private readonly augmentFileCallback: ((file: LlmResponseFile) => string) | null;
+
   constructor(
     rootPath: string,
     private readonly config: EnvironmentConfig & Required<Pick<EnvironmentConfig, 'executor'>>,
@@ -114,6 +118,7 @@ export class Environment {
     this.ratingHash = this.getRatingHash(this.ratings, this.ratingCategories);
     this.analysisPrompts = this.resolveAnalysisPrompts(config);
     this.augmentExecutablePrompt = config.augmentExecutablePrompt || null;
+    this.augmentFileCallback = config.augmentGeneratedFile || null;
     this.validateRatingHash(this.ratingHash, config);
   }
 
@@ -191,6 +196,13 @@ export class Environment {
     });
   }
 
+  /** Replaces each file's `code`, in place, with the user callback's output (no-op if unset). */
+  augmentResponseFiles(files: LlmResponseFile[]): void {
+    if (this.augmentFileCallback) {
+      files.forEach(file => (file.code = this.augmentFileCallback!(file)));
+    }
+  }
+
   async destroy(): Promise<void> {
     await this.executor.destroy();
 
diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts
index 60ae72c..fe19a94 100644
--- a/runner/orchestration/codegen.ts
+++ b/runner/orchestration/codegen.ts
@@ -129,6 +129,7 @@ export async function repairCodeWithAI(
   );
 
   if (response.success) {
+    env.augmentResponseFiles(response.outputFiles);
     progress.log(
       promptDef,
       'codegen',
diff --git a/runner/orchestration/generate-initial-files.ts b/runner/orchestration/generate-initial-files.ts
index b137ecb..6a0495d 100644
--- a/runner/orchestration/generate-initial-files.ts
+++ b/runner/orchestration/generate-initial-files.ts
@@ -75,6 +75,7 @@ export async function generateInitialFiles(
   );
 
   if (response.success) {
+    env.augmentResponseFiles(response.outputFiles);
     progress.log(
       promptDef,
       'codegen',
@@ -90,7 +91,7 @@ export async function generateInitialFiles(
   }
 
   return {
-    files: response.outputFiles!,
+    files: response.outputFiles,
     usage: response.usage,
     reasoning: response.reasoning,
     toolLogs: response.toolLogs,
diff --git a/runner/orchestration/repair.ts b/runner/orchestration/repair.ts
index 13acc24..152acc1 100644
--- a/runner/orchestration/repair.ts
+++ b/runner/orchestration/repair.ts
@@ -126,7 +126,7 @@ async function handleRepairResponse(
   const newAttemptFiles = previousAttemptFiles.map(f => ({...f}));
 
   mergeRepairFiles(repairResponse.outputFiles, newAttemptFiles);
-  writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
+  await writeResponseFiles(directory, newAttemptFiles, env, rootPromptDef.name);
 
   const buildResult = await runBuild(
     evalID,