Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
cbed950
fix: doc fine mode bug
CaralHsi Dec 7, 2025
20e0839
fix: doc fine mode bug
CaralHsi Dec 7, 2025
fff0fb2
feat: init longbench_v2
CaralHsi Dec 7, 2025
15562c4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
9beabba
feat: more strict embedder truncation
CaralHsi Dec 7, 2025
fc54da8
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 7, 2025
8f368bb
feat: parallel processing fine mode in multi-modal-fine
CaralHsi Dec 7, 2025
54897a9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
be293bc
feat: update parsers; add chunk info into source; remove origin_part
CaralHsi Dec 8, 2025
ba1c161
fix: conflict
CaralHsi Dec 8, 2025
8e8b91b
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
2edd0a3
feat: modify chunk_content in file-fine-parser
CaralHsi Dec 8, 2025
6991ed7
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
45609ab
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
f80896e
fix: token counter bug
CaralHsi Dec 8, 2025
a3f2b32
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
b375d51
feat: enlarge polardb
CaralHsi Dec 8, 2025
0bfcaa9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
69dd3a8
feat: decrease parallelism
CaralHsi Dec 8, 2025
7fa7b77
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
ac38046
feat: add image parser in file
CaralHsi Dec 8, 2025
ef02140
feat: add image parser in file
CaralHsi Dec 8, 2025
37bcc90
feat: update file_content_parser
CaralHsi Dec 8, 2025
7e2adb4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
20af5d0
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
ec34637
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
31ad564
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
72eb129
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
0ef1bb5
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
cf1291b
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
1ecf03e
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
b58ee88
fix: image bug
CaralHsi Dec 9, 2025
f94b001
feat: increase playground depth
CaralHsi Dec 9, 2025
3819dae
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def main(frame, version="default", num_workers=10, max_samples=None):

# Initialize checkpoint file for resume functionality
checkpoint_dir = os.path.join(
ROOT_DIR, "evaluation", "results", "longbench_v2", f"{frame}-{version}"
ROOT_DIR, "evaluation", "results", "long_bench_v2", f"{frame}-{version}"
)
os.makedirs(checkpoint_dir, exist_ok=True)
record_file = os.path.join(checkpoint_dir, "success_records.txt")
Expand Down Expand Up @@ -179,13 +179,13 @@ def main(frame, version="default", num_workers=10, max_samples=None):
parser.add_argument(
"--version",
type=str,
default="long-bench-v2-1208-1556",
default="default",
help="Version identifier for saving results",
)
parser.add_argument(
"--workers",
type=int,
default=20,
default=3,
help="Number of parallel workers",
)
parser.add_argument(
Expand Down
158 changes: 0 additions & 158 deletions evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py

This file was deleted.

9 changes: 6 additions & 3 deletions evaluation/scripts/long_bench-v2/longbench_v2_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def main(frame, version="default"):
print("=" * 80 + "\n")

# Load responses
responses_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json"
responses_path = f"results/long_bench_v2/{frame}-{version}/{frame}_longbench_v2_responses.json"
if not os.path.exists(responses_path):
print(f"❌ Responses not found: {responses_path}")
print("Please run longbench_v2_responses.py first")
Expand All @@ -92,11 +92,14 @@ def main(frame, version="default"):
with open(responses_path, encoding="utf-8") as f:
responses = json.load(f)

# Only keep entries with non-empty context (search_context) to align with response generation
filtered = [r for r in responses if str(r.get("search_context", "")).strip() != ""]

# Calculate metrics
metrics = calculate_accuracy(responses)
metrics = calculate_accuracy(filtered)

# Save metrics
output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_metrics.json"
output_path = f"results/long_bench_v2/{frame}-{version}/{frame}_longbench_v2_metrics.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
Expand Down
85 changes: 73 additions & 12 deletions evaluation/scripts/long_bench-v2/longbench_v2_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import sys
import threading

from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time
Expand Down Expand Up @@ -85,8 +86,13 @@ def generate_response(llm_client, context, question, choice_a, choice_b, choice_
return ""


def process_sample(search_result, llm_client):
def process_sample(search_result, llm_client, success_records, record_file, file_lock):
"""Process a single sample: generate answer."""
sample_idx = search_result.get("sample_idx")
# Skip if already processed
if sample_idx is not None and str(sample_idx) in success_records:
return None

start = time()

context = search_result.get("context", "")
Expand All @@ -96,6 +102,10 @@ def process_sample(search_result, llm_client):
choice_c = search_result.get("choice_C", "")
choice_d = search_result.get("choice_D", "")

# Skip empty/placeholder contexts (e.g., "\n" or whitespace-only)
if not context or context.strip() == "":
return None

# Generate answer
response = generate_response(
llm_client, context, question, choice_a, choice_b, choice_c, choice_d
Expand All @@ -106,7 +116,7 @@ def process_sample(search_result, llm_client):

response_duration_ms = (time() - start) * 1000

return {
result = {
"sample_idx": search_result.get("sample_idx"),
"_id": search_result.get("_id"),
"domain": search_result.get("domain"),
Expand All @@ -123,10 +133,20 @@ def process_sample(search_result, llm_client):
"response": response,
"judge": pred == search_result.get("answer") if pred else False,
"search_context": context,
# Preserve full search results payload (e.g., list of memories)
"search_results": search_result.get("search_results"),
"response_duration_ms": response_duration_ms,
"search_duration_ms": search_result.get("search_duration_ms", 0),
}

# Record successful processing (thread-safe)
if sample_idx is not None:
with file_lock, open(record_file, "a") as f:
f.write(f"{sample_idx}\n")
f.flush()

return result


def main(frame, version="default", num_workers=10):
"""Main response generation function."""
Expand All @@ -136,10 +156,16 @@ def main(frame, version="default", num_workers=10):
print(f"🚀 LONGBENCH V2 RESPONSE GENERATION - {frame.upper()} v{version}".center(80))
print("=" * 80 + "\n")

# Load search results
search_path = (
f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_search_results.json"
# Initialize checkpoint file for resume functionality
checkpoint_dir = os.path.join(
ROOT_DIR, "evaluation", "results", "long_bench_v2", f"{frame}-{version}"
)
os.makedirs(checkpoint_dir, exist_ok=True)
record_file = os.path.join(checkpoint_dir, "response_success_records.txt")
search_path = os.path.join(checkpoint_dir, f"{frame}_longbench_v2_search_results.json")
output_path = os.path.join(checkpoint_dir, f"{frame}_longbench_v2_responses.json")

# Load search results
if not os.path.exists(search_path):
print(f"❌ Search results not found: {search_path}")
print("Please run longbench_v2_search.py first")
Expand All @@ -148,6 +174,30 @@ def main(frame, version="default", num_workers=10):
with open(search_path, encoding="utf-8") as f:
search_results = json.load(f)

# Load existing results and success records for resume
existing_results = {}
success_records = set()
if os.path.exists(output_path):
with open(output_path, encoding="utf-8") as f:
existing_results_list = json.load(f)
for result in existing_results_list:
sample_idx = result.get("sample_idx")
if sample_idx is not None:
existing_results[sample_idx] = result
success_records.add(str(sample_idx))
print(f"📋 Found {len(existing_results)} existing responses (resume mode)")
else:
print("📋 Starting fresh response generation (no checkpoint found)")

# Load additional success records from checkpoint file
if os.path.exists(record_file):
with open(record_file) as f:
for line in f:
line = line.strip()
if line and line not in success_records:
success_records.add(line)
print(f"📋 Total {len(success_records)} samples already processed")

# Initialize LLM client
llm_client = OpenAI(
api_key=os.getenv("CHAT_MODEL_API_KEY"),
Expand All @@ -156,9 +206,15 @@ def main(frame, version="default", num_workers=10):
print(f"🔌 Using OpenAI client with model: {os.getenv('CHAT_MODEL')}")

# Process all samples
all_responses = []
new_results = []
file_lock = threading.Lock() # Lock for thread-safe file writing
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = [executor.submit(process_sample, sample, llm_client) for sample in search_results]
futures = [
executor.submit(
process_sample, sample, llm_client, success_records, record_file, file_lock
)
for sample in search_results
]

for future in tqdm(
as_completed(futures),
Expand All @@ -167,11 +223,16 @@ def main(frame, version="default", num_workers=10):
):
result = future.result()
if result:
all_responses.append(result)

# Save responses
output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
new_results.append(result)
# Update existing results with new result
sample_idx = result.get("sample_idx")
if sample_idx is not None:
existing_results[sample_idx] = result

# Merge and save all results
all_responses = list(existing_results.values())
# Sort by sample_idx to maintain order
all_responses.sort(key=lambda x: x.get("sample_idx", 0))

with open(output_path, "w", encoding="utf-8") as f:
json.dump(all_responses, f, ensure_ascii=False, indent=2)
Expand Down
Loading