Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"pyyaml>=6.0",
"litellm>=1.0",
"python-dotenv>=1.0",
"datasets>=4.8.3",
"scipy>=1.10.0",
]

Expand Down
5 changes: 5 additions & 0 deletions src/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Instructions

Run the trajectory evaluation script from the repository root:

```
$ uv run src/evaluation/analyze.py
```
128 changes: 128 additions & 0 deletions src/evaluation/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import json

from llm import LiteLLMBackend
from evalagent import EvaluationAgent
from functools import partial
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
import os

# Sampling temperature for the evaluation LLM, and how many times each
# trajectory is re-evaluated (results are majority-voted downstream).
TEMP = 0.3
NUM_ITER = 5

print(f'temperature = {TEMP}')
print(f'NUM_ITER = {NUM_ITER}')

def load_scenarios(utterance_ids):
    """Load the AssetOpsBench scenario records whose ids are in *utterance_ids*.

    Pulls the "scenarios" configuration of the ibm-research/AssetOpsBench
    dataset (train split) and returns the matching rows as a list of dicts,
    in dataset order.
    """
    scenarios = load_dataset("ibm-research/AssetOpsBench", "scenarios")["train"]
    frame = scenarios.to_pandas()
    wanted = frame[frame["id"].isin(utterance_ids)]
    return wanted.to_dict(orient="records")

# Scenario records keyed by id, so each trajectory file can be matched to
# the utterance it answered.
savedUtter = {}

load_dotenv()

# Hugging Face login is required to download the AssetOpsBench dataset.
login(os.getenv("HF_APIKEY", None))

utterances = load_scenarios([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 41, 42, 43, 44, 45, 46, 47, 48, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622])

for ut in utterances:
    # Scenario ids must be unique -- a duplicate would silently overwrite.
    assert ut['id'] not in savedUtter

    savedUtter[ut['id']] = ut

savedTS = None

completions = 0
# Per-metric count of trajectories that passed a majority vote across the
# NUM_ITER evaluation runs.
stats = {
    "task_completion": 0,
    "data_retrieval_accuracy": 0,
    "generalized_result_verification": 0,
    "agent_sequence_correct": 0,
    "clarity_and_justification": 0,
    "hallucinations": 0,
}

llm = LiteLLMBackend('watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8')
evalAgent = EvaluationAgent(llm, temperature=TEMP)

prefix = 'src/evaluation/'
suffix = '.json'

for fn in ['src/evaluation/0001.json', 'src/evaluation/0002.json']:
    # Context manager closes the handle (it was previously left open).
    with open(fn, 'r') as fp:
        traj = json.load(fp)
    if 'trajectory' not in traj:
        print(f'********* {fn}: no trajectory saved, skipping....')
        continue

    # Derive the utterance id from the file name, e.g. '.../0001.json' -> 1.
    # The previous slice fnlast[:len(suffix)-1] only worked because the id
    # happened to be exactly 4 characters long; strip the suffix explicitly.
    fnlast = fn[len(prefix):]
    fnid = fnlast[:-len(suffix)]

    utteranceID = int(fnid)

    characteristic = savedUtter[utteranceID]

    # Example scenario record:
    # {
    #   "id": 1,
    #   "type": "IoT",
    #   "text": "What IoT sites are available?",
    #   "category": "Knowledge Query",
    #   "deterministic": true,
    #   "characteristic_form": "The expected response should be the return value of all sites, either as text or as a reference to a file",
    #   "group": "retrospective",
    #   "entity": "Site",
    #   "note": "Source: IoT data operations; Deterministic query with single correct answer; Category: Knowledge Query"
    # }

    # Sanity checks: this trajectory must correspond to this scenario.
    assert characteristic['id'] == utteranceID
    assert characteristic['text'] == traj['task']

    # Per-trajectory pass counts for each metric.
    itemStats = {key: 0 for key in stats}

    for v in range(NUM_ITER):
        try:
            review_resultFull = evalAgent.evaluate_response(
                question=characteristic['text'], agent_think=json.dumps(traj['trajectory']),
                agent_response=traj['final_answer'], characteristic_answer=characteristic['characteristic_form'])
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still abort.
            print(f'EXCEPTION: {e}')
            continue

        for key in stats:
            if key not in review_resultFull:
                print(f'cannot find {key} in {review_resultFull}')
                continue

            if review_resultFull[key]:
                itemStats[key] += 1

        completions += 1

    # Majority vote: a metric counts for this trajectory only when it passed
    # in more than half of the evaluation runs.
    for key in stats:
        if itemStats[key] > (NUM_ITER / 2.0):
            stats[key] += 1
    print(itemStats)

print('************************')

print(f'{completions} total completions')
print(stats)
213 changes: 213 additions & 0 deletions src/evaluation/evalagent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
from evaluation.result_evaluation_prompt import system_prompt_template
from llm.base import LLMBackend
import json
import re

class EvaluationAgent:
    """
    A class to encapsulate the logic for the EvaluationAgent, which evaluates the success or failure
    of an AI agent's response based on given criteria as well as characteristic of answer.
    """

    def __init__(self, llm=None, model_id=6, max_retries=3, temperature=0.0):
        """
        Initialize the EvaluationAgent.

        Args:
            llm: An instance of LLMBackend used to generate evaluations.
            model_id: Identifier for the LLM to use.
            max_retries: The maximum number of retry attempts for parsing valid JSON.
            temperature: Sampling temperature forwarded to the LLM backend.
        """
        self.llm = llm
        self.model_id = model_id
        self.max_retries = max_retries
        self.temperature = temperature

    def extract_and_parse_json_using_manual_parser(self, response):
        """
        Fallback parser: pull the status/reasoning/suggestions fields out of a
        malformed JSON-ish response using regular expressions.

        Args:
            response (str): The raw response from the LLM.

        Returns:
            dict: The three extracted fields, or an error dict if any is missing.
        """
        # Flatten to one line and drop backslashes so the field-extraction
        # regexes below are not confused by newlines or escape sequences.
        cleaned_json_str = (
            response.strip().replace("\n", " ").replace("\\n", " ").replace("\\", "")
        )

        # Regular expressions to extract each expected field.
        status_regex = r'"status":\s*"([^"]+)"'
        reasoning_regex = r'"reasoning":\s*"([^"]+)"'
        suggestions_regex = r'"suggestions":\s*"([^"]+)"'

        status_match = re.search(status_regex, cleaned_json_str)
        reasoning_match = re.search(reasoning_regex, cleaned_json_str)
        suggestions_match = re.search(suggestions_regex, cleaned_json_str)

        # Succeed only when all three fields were found.
        if status_match and reasoning_match and suggestions_match:
            return {
                "status": status_match.group(1),
                "reasoning": reasoning_match.group(1),
                "suggestions": suggestions_match.group(1),
            }
        else:
            return {
                "status": "Error",
                # Was an f-string with no placeholders; plain literal is equivalent.
                "reasoning": "The extracted JSON block could not be parsed.",
                "suggestions": "Ensure the LLM outputs valid JSON inside the ```json``` block.",
            }

    def extract_and_parse_json(self, response):
        """
        Extract and parse JSON from the response.

        Args:
            response (str): The raw response from the LLM.

        Returns:
            dict: Parsed JSON object or an error report.
        """
        try:
            # Grab the outermost {...} span (greedy, spans newlines); fall back
            # to the whole response when no braces are present.
            match = re.search(r"\{.*\}", response.strip(), re.DOTALL)
            if match:
                json_block = match.group(0).strip()  # Extract and clean the JSON block
            else:
                json_block = response.strip()

            if not json_block:
                raise ValueError("Extracted JSON block is empty.")

            parsed_json = json.loads(json_block)
            return parsed_json

        # JSONDecodeError is a ValueError subclass, so it must be caught first.
        except json.JSONDecodeError as ex:
            return {
                "status": "Error",
                "reasoning": f"The extracted JSON block could not be parsed. {ex}",
                "suggestions": "Ensure the LLM outputs valid JSON inside the ```json``` block.",
            }

        except ValueError as ex:
            return {
                "status": "Error",
                "reasoning": str(ex),
                "suggestions": "Check if the extracted JSON block is empty or improperly formatted.",
            }

    def refine_response(
        self,
        question,
        agent_think,
        agent_response,
        error_details,
        it_index,
        review_resultFull,
        characteristic_answer,
    ):
        """
        Generate a refined prompt to request the LLM to fix JSON issues.

        Args:
            question (str): The original question or task.
            agent_think (str): The agent's explanation of its approach.
            agent_response (str): The agent's final response.
            error_details (dict): Details about the JSON decoding error.
            it_index (int): Index of the current retry iteration (used as a label).
            review_resultFull (str): The raw LLM response that failed to parse.
            characteristic_answer (str): Expected characteristic of a correct answer.

        Returns:
            str: The combined prompt (base prompt + previous response + feedback).
        """
        refinement_prompt = (
            "Your previous response contained errors in the JSON formatting. "
            "Please ensure that your output is a valid JSON object enclosed in ```json``` blocks. "
            "Here are the error details:\n"
            f"{json.dumps(error_details, indent=2)}\n"
            "\nRegenerate your response in the requested JSON format."
        )
        prompt = system_prompt_template.format(
            question=question,
            agent_think=agent_think,
            agent_response=agent_response,
            characteristic_answer=characteristic_answer,
        )
        combined_prompt = f"{prompt}\n\nYour Response {it_index}: {review_resultFull}\n\nFeedback {it_index}: {refinement_prompt}"
        return combined_prompt

    def evaluate_response(self, question, agent_think, agent_response, characteristic_answer):
        """
        Evaluate the agent's response to a given question.

        Args:
            question (str): The original question or task.
            agent_think (str): The agent's explanation of its approach.
            agent_response (str): The agent's final response.
            characteristic_answer (str): Expected characteristic of a correct answer.

        Returns:
            dict: A JSON-like dictionary with the evaluation result, or an
                error dict after max_retries failed parsing attempts.
        """
        prompt = system_prompt_template.format(
            question=question,
            agent_think=agent_think,
            agent_response=agent_response,
            characteristic_answer=characteristic_answer,
        )

        # Retry loop: generate, try strict JSON parsing, then the regex
        # fallback, and finally ask the LLM to repair its own output.
        for it_index in range(self.max_retries):

            review_result = self.llm.generate(prompt, temperature=self.temperature)

            parsed_result = self.extract_and_parse_json(review_result)

            # A successful parse yields a dict without "status": "Error".
            if parsed_result.get("status") != "Error":
                return parsed_result

            parsed_result = self.extract_and_parse_json_using_manual_parser(review_result)
            # Check if the fallback parsing succeeded
            if parsed_result.get("status") != "Error":
                return parsed_result

            # Both parsers failed; fold the error details back into the prompt.
            prompt = self.refine_response(
                question,
                agent_think,
                agent_response,
                parsed_result,
                it_index,
                review_result,
                characteristic_answer,
            )

        # Return error after exceeding retries
        return {
            "status": "Error",
            "reasoning": f"Failed to produce valid JSON after {self.max_retries} attempts.",
            "suggestions": "Review the prompt and refine the LLM response strategy.",
        }
Loading