Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"pyyaml>=6.0",
"litellm>=1.0",
"python-dotenv>=1.0",
"datasets>=4.8.3",
"scipy>=1.10.0",
]

Expand Down
5 changes: 5 additions & 0 deletions src/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Instructions

Run the trajectory evaluation script from the repository root:

```
$ uv run src/evaluation/analyze.py
```
128 changes: 128 additions & 0 deletions src/evaluation/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import json

from llm import LiteLLMBackend
from evalagent import EvaluationAgent
from functools import partial
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
import os

# Sampling temperature for the evaluation LLM, and how many times each
# trajectory is re-evaluated (results are majority-voted downstream).
TEMP = 0.3
NUM_ITER = 5

print(f'temperature = {TEMP}')
print(f'NUM_ITER = {NUM_ITER}')

def load_scenarios(utterance_ids):
    """Load the AssetOpsBench scenario records whose ids are in *utterance_ids*.

    Pulls the "scenarios" configuration of the ibm-research/AssetOpsBench
    dataset (train split) and returns the matching rows as a list of dicts,
    in dataset order.
    """
    scenarios = load_dataset("ibm-research/AssetOpsBench", "scenarios")["train"]
    frame = scenarios.to_pandas()
    wanted = frame[frame["id"].isin(utterance_ids)]
    return wanted.to_dict(orient="records")

# Scenario records keyed by id, so each trajectory file can be matched to
# the utterance it answered.
savedUtter = {}

load_dotenv()

# Hugging Face login is required to download the AssetOpsBench dataset.
login(os.getenv("HF_APIKEY", None))

utterances = load_scenarios([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 41, 42, 43, 44, 45, 46, 47, 48, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622])

for ut in utterances:
    # Scenario ids must be unique -- a duplicate would silently overwrite.
    assert ut['id'] not in savedUtter

    savedUtter[ut['id']] = ut

savedTS = None

completions = 0
# Per-metric count of trajectories that passed a majority vote across the
# NUM_ITER evaluation runs.
stats = {
    "task_completion": 0,
    "data_retrieval_accuracy": 0,
    "generalized_result_verification": 0,
    "agent_sequence_correct": 0,
    "clarity_and_justification": 0,
    "hallucinations": 0,
}

llm = LiteLLMBackend('watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8')
evalAgent = EvaluationAgent(llm, temperature=TEMP)

prefix = 'src/evaluation/'
suffix = '.json'

for fn in ['src/evaluation/0001.json', 'src/evaluation/0002.json']:
    # Context manager closes the handle (it was previously left open).
    with open(fn, 'r') as fp:
        traj = json.load(fp)
    if 'trajectory' not in traj:
        print(f'********* {fn}: no trajectory saved, skipping....')
        continue

    # Derive the utterance id from the file name, e.g. '.../0001.json' -> 1.
    # The previous slice fnlast[:len(suffix)-1] only worked because the id
    # happened to be exactly 4 characters long; strip the suffix explicitly.
    fnlast = fn[len(prefix):]
    fnid = fnlast[:-len(suffix)]

    utteranceID = int(fnid)

    characteristic = savedUtter[utteranceID]

    # Example scenario record:
    # {
    #   "id": 1,
    #   "type": "IoT",
    #   "text": "What IoT sites are available?",
    #   "category": "Knowledge Query",
    #   "deterministic": true,
    #   "characteristic_form": "The expected response should be the return value of all sites, either as text or as a reference to a file",
    #   "group": "retrospective",
    #   "entity": "Site",
    #   "note": "Source: IoT data operations; Deterministic query with single correct answer; Category: Knowledge Query"
    # }

    # Sanity checks: this trajectory must correspond to this scenario.
    assert characteristic['id'] == utteranceID
    assert characteristic['text'] == traj['task']

    # Per-trajectory pass counts for each metric.
    itemStats = {key: 0 for key in stats}

    for v in range(NUM_ITER):
        try:
            review_resultFull = evalAgent.evaluate_response(
                question=characteristic['text'], agent_think=json.dumps(traj['trajectory']),
                agent_response=traj['final_answer'], characteristic_answer=characteristic['characteristic_form'])
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still abort.
            print(f'EXCEPTION: {e}')
            continue

        for key in stats:
            if key not in review_resultFull:
                print(f'cannot find {key} in {review_resultFull}')
                continue

            if review_resultFull[key]:
                itemStats[key] += 1

        completions += 1

    # Majority vote: a metric counts for this trajectory only when it passed
    # in more than half of the evaluation runs.
    for key in stats:
        if itemStats[key] > (NUM_ITER / 2.0):
            stats[key] += 1
    print(itemStats)

print('************************')

print(f'{completions} total completions')
print(stats)
213 changes: 213 additions & 0 deletions src/evaluation/evalagent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
from evaluation.result_evaluation_prompt import system_prompt_template
from llm.base import LLMBackend
import json
import re

class EvaluationAgent:
    """
    A class to encapsulate the logic for the EvaluationAgent, which evaluates the success or failure
    of an AI agent's response based on given criteria as well as characteristic of answer.
    """

    def __init__(self, llm=None, model_id=6, max_retries=3, temperature=0.0):
        """
        Initialize the EvaluationAgent.

        Args:
            llm: An instance of LLMBackend used to generate evaluations.
            model_id: Identifier for the LLM to use.
            max_retries: The maximum number of retry attempts for parsing valid JSON.
            temperature: Sampling temperature forwarded to the LLM backend.
        """
        self.llm = llm
        self.model_id = model_id
        self.max_retries = max_retries
        self.temperature = temperature

    def extract_and_parse_json_using_manual_parser(self, response):
        """
        Fallback parser: pull the status/reasoning/suggestions fields out of a
        malformed JSON-ish response using regular expressions.

        Args:
            response (str): The raw response from the LLM.

        Returns:
            dict: The three extracted fields, or an error dict if any is missing.
        """
        # Flatten to one line and drop backslashes so the field-extraction
        # regexes below are not confused by newlines or escape sequences.
        cleaned_json_str = (
            response.strip().replace("\n", " ").replace("\\n", " ").replace("\\", "")
        )

        # Regular expressions to extract each expected field.
        status_regex = r'"status":\s*"([^"]+)"'
        reasoning_regex = r'"reasoning":\s*"([^"]+)"'
        suggestions_regex = r'"suggestions":\s*"([^"]+)"'

        status_match = re.search(status_regex, cleaned_json_str)
        reasoning_match = re.search(reasoning_regex, cleaned_json_str)
        suggestions_match = re.search(suggestions_regex, cleaned_json_str)

        # Succeed only when all three fields were found.
        if status_match and reasoning_match and suggestions_match:
            return {
                "status": status_match.group(1),
                "reasoning": reasoning_match.group(1),
                "suggestions": suggestions_match.group(1),
            }
        else:
            return {
                "status": "Error",
                # Was an f-string with no placeholders; plain literal is equivalent.
                "reasoning": "The extracted JSON block could not be parsed.",
                "suggestions": "Ensure the LLM outputs valid JSON inside the ```json``` block.",
            }

    def extract_and_parse_json(self, response):
        """
        Extract and parse JSON from the response.

        Args:
            response (str): The raw response from the LLM.

        Returns:
            dict: Parsed JSON object or an error report.
        """
        try:
            # Grab the outermost {...} span (greedy, spans newlines); fall back
            # to the whole response when no braces are present.
            match = re.search(r"\{.*\}", response.strip(), re.DOTALL)
            if match:
                json_block = match.group(0).strip()  # Extract and clean the JSON block
            else:
                json_block = response.strip()

            if not json_block:
                raise ValueError("Extracted JSON block is empty.")

            parsed_json = json.loads(json_block)
            return parsed_json

        # JSONDecodeError is a ValueError subclass, so it must be caught first.
        except json.JSONDecodeError as ex:
            return {
                "status": "Error",
                "reasoning": f"The extracted JSON block could not be parsed. {ex}",
                "suggestions": "Ensure the LLM outputs valid JSON inside the ```json``` block.",
            }

        except ValueError as ex:
            return {
                "status": "Error",
                "reasoning": str(ex),
                "suggestions": "Check if the extracted JSON block is empty or improperly formatted.",
            }

    def refine_response(
        self,
        question,
        agent_think,
        agent_response,
        error_details,
        it_index,
        review_resultFull,
        characteristic_answer,
    ):
        """
        Generate a refined prompt to request the LLM to fix JSON issues.

        Args:
            question (str): The original question or task.
            agent_think (str): The agent's explanation of its approach.
            agent_response (str): The agent's final response.
            error_details (dict): Details about the JSON decoding error.
            it_index (int): Index of the current retry iteration (used as a label).
            review_resultFull (str): The raw LLM response that failed to parse.
            characteristic_answer (str): Expected characteristic of a correct answer.

        Returns:
            str: The combined prompt (base prompt + previous response + feedback).
        """
        refinement_prompt = (
            "Your previous response contained errors in the JSON formatting. "
            "Please ensure that your output is a valid JSON object enclosed in ```json``` blocks. "
            "Here are the error details:\n"
            f"{json.dumps(error_details, indent=2)}\n"
            "\nRegenerate your response in the requested JSON format."
        )
        prompt = system_prompt_template.format(
            question=question,
            agent_think=agent_think,
            agent_response=agent_response,
            characteristic_answer=characteristic_answer,
        )
        combined_prompt = f"{prompt}\n\nYour Response {it_index}: {review_resultFull}\n\nFeedback {it_index}: {refinement_prompt}"
        return combined_prompt

    def evaluate_response(self, question, agent_think, agent_response, characteristic_answer):
        """
        Evaluate the agent's response to a given question.

        Args:
            question (str): The original question or task.
            agent_think (str): The agent's explanation of its approach.
            agent_response (str): The agent's final response.
            characteristic_answer (str): Expected characteristic of a correct answer.

        Returns:
            dict: A JSON-like dictionary with the evaluation result, or an
                error dict after max_retries failed parsing attempts.
        """
        prompt = system_prompt_template.format(
            question=question,
            agent_think=agent_think,
            agent_response=agent_response,
            characteristic_answer=characteristic_answer,
        )

        # Retry loop: generate, try strict JSON parsing, then the regex
        # fallback, and finally ask the LLM to repair its own output.
        for it_index in range(self.max_retries):

            review_result = self.llm.generate(prompt, temperature=self.temperature)

            parsed_result = self.extract_and_parse_json(review_result)

            # A successful parse yields a dict without "status": "Error".
            if parsed_result.get("status") != "Error":
                return parsed_result

            parsed_result = self.extract_and_parse_json_using_manual_parser(review_result)
            # Check if the fallback parsing succeeded
            if parsed_result.get("status") != "Error":
                return parsed_result

            # Both parsers failed; fold the error details back into the prompt.
            prompt = self.refine_response(
                question,
                agent_think,
                agent_response,
                parsed_result,
                it_index,
                review_result,
                characteristic_answer,
            )

        # Return error after exceeding retries
        return {
            "status": "Error",
            "reasoning": f"Failed to produce valid JSON after {self.max_retries} attempts.",
            "suggestions": "Review the prompt and refine the LLM response strategy.",
        }
Loading