Merged
Commits
66 commits
9fdbcf3
Dev (#496)
CarltonXiang Nov 21, 2025
439ed49
hotfix:hotfix
fridayL Nov 21, 2025
2b6dc7e
hotfix:hotfix (#513)
fridayL Nov 21, 2025
39a7b34
test: add routers api
CaralHsi Nov 22, 2025
cbed950
fix: doc fine mode bug
CaralHsi Dec 7, 2025
20e0839
fix: doc fine mode bug
CaralHsi Dec 7, 2025
fff0fb2
feat: init longbench_v2
CaralHsi Dec 7, 2025
15562c4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
9beabba
feat: more strict embedder truncation
CaralHsi Dec 7, 2025
fc54da8
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 7, 2025
8f368bb
feat: parallel processing fine mode in multi-modal-fine
CaralHsi Dec 7, 2025
54897a9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
be293bc
feat: update parsers; add chunk info into source; remove origin_part
CaralHsi Dec 8, 2025
ba1c161
fix: conflict
CaralHsi Dec 8, 2025
8e8b91b
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
2edd0a3
feat: modify chunk_content in file-fine-parser
CaralHsi Dec 8, 2025
6991ed7
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
45609ab
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
f80896e
fix: token counter bug
CaralHsi Dec 8, 2025
a3f2b32
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
b375d51
feat: enlarge polardb
CaralHsi Dec 8, 2025
0bfcaa9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
69dd3a8
feat: decrease parallel
CaralHsi Dec 8, 2025
7fa7b77
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
ac38046
feat: add image parser in file
CaralHsi Dec 8, 2025
ef02140
feat: add image parser in file
CaralHsi Dec 8, 2025
37bcc90
feat: update file_content_parser
CaralHsi Dec 8, 2025
7e2adb4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
20af5d0
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
ec34637
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
31ad564
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
72eb129
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
0ef1bb5
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
cf1291b
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
1ecf03e
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
b58ee88
fix: image bug
CaralHsi Dec 9, 2025
f94b001
feat: increase playground depth
CaralHsi Dec 9, 2025
3819dae
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
eba9e96
feat: set parsed_text None in file parser
CaralHsi Dec 9, 2025
5504d8d
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
5c496ee
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
918bc6a
fix: file_ids bug in file-mode
CaralHsi Dec 9, 2025
1e6dd73
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
a8ac57c
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
40998f8
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 15, 2025
56e0d6d
feat: update evaluation
CaralHsi Dec 15, 2025
c64fd26
feat: update evaluation
CaralHsi Dec 15, 2025
41ac6c2
feat: add general string prompt
CaralHsi Dec 16, 2025
0696126
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 16, 2025
3669b39
fix: conflict
CaralHsi Dec 16, 2025
8d5b51f
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 16, 2025
eaedc9a
fix: test server router
CaralHsi Dec 16, 2025
7674ecc
feat: update evaluation
CaralHsi Dec 16, 2025
187e8f2
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
288207f
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 17, 2025
66e9325
feat: decrease graph-db batch size to 5
CaralHsi Dec 17, 2025
eb426a2
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
321d5e0
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
e10365c
fix: default name in long_bench-v2/longbench_v2_search
CaralHsi Dec 17, 2025
31f07fc
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
316e147
fix: test bug
CaralHsi Dec 17, 2025
e8e29f8
Update test_server_router.py
CaralHsi Dec 17, 2025
ce70121
Update test_product_router.py
CaralHsi Dec 17, 2025
9a379f5
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
9e7ca00
feat: comment
CaralHsi Dec 17, 2025
e1f46b7
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
4 changes: 2 additions & 2 deletions evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py
@@ -33,7 +33,7 @@ def ingest_sample(
# Get context and convert to messages
context = sample.get("context", "")

# For memos, we ingest the context as document content
# For memos, we ingest the context as a raw document content
messages = [
{
"type": "file",
@@ -185,7 +185,7 @@ def main(frame, version="default", num_workers=10, max_samples=None):
parser.add_argument(
"--workers",
type=int,
default=3,
default=2,
help="Number of parallel workers",
)
parser.add_argument(
157 changes: 94 additions & 63 deletions evaluation/scripts/long_bench-v2/longbench_v2_metric.py
@@ -4,75 +4,80 @@


def calculate_accuracy(responses):
"""Calculate accuracy metrics for LongBench v2."""
"""Calculate accuracy metrics for LongBench v2.

Logic is aligned with longbench_stx.print_metrics, but returns a dict
and additionally computes by_domain statistics.
"""
total = len(responses)
if total == 0:
return {}

# Overall accuracy
correct = sum(1 for r in responses if r.get("judge", False))
overall_acc = round(100 * correct / total, 1)

# By difficulty
easy_items = [r for r in responses if r.get("difficulty") == "easy"]
hard_items = [r for r in responses if r.get("difficulty") == "hard"]
easy_acc = (
round(100 * sum(1 for r in easy_items if r.get("judge", False)) / len(easy_items), 1)
if easy_items
else 0.0
)
hard_acc = (
round(100 * sum(1 for r in hard_items if r.get("judge", False)) / len(hard_items), 1)
if hard_items
else 0.0
)

# By length
short_items = [r for r in responses if r.get("length") == "short"]
medium_items = [r for r in responses if r.get("length") == "medium"]
long_items = [r for r in responses if r.get("length") == "long"]

short_acc = (
round(100 * sum(1 for r in short_items if r.get("judge", False)) / len(short_items), 1)
if short_items
else 0.0
)
medium_acc = (
round(100 * sum(1 for r in medium_items if r.get("judge", False)) / len(medium_items), 1)
if medium_items
else 0.0
)
long_acc = (
round(100 * sum(1 for r in long_items if r.get("judge", False)) / len(long_items), 1)
if long_items
else 0.0
)

# By domain
# Counters (aligned with longbench_stx.print_metrics)
easy = hard = short = medium = long = 0
easy_acc = hard_acc = short_acc = medium_acc = long_acc = 0
total_prompt_tokens = 0

for pred in responses:
acc = int(pred.get("judge", False))
diff = pred.get("difficulty", "easy")
length = pred.get("length", "short")

pt = pred.get("prompt_tokens")
if isinstance(pt, int | float):
total_prompt_tokens += int(pt)

if diff == "easy":
easy += 1
easy_acc += acc
else:
hard += 1
hard_acc += acc

if length == "short":
short += 1
short_acc += acc
elif length == "medium":
medium += 1
medium_acc += acc
else:
long += 1
long_acc += acc

o_acc = round(100 * (easy_acc + hard_acc) / total, 2)
e_acc = round(100 * easy_acc / easy, 2) if easy > 0 else 0.0
h_acc = round(100 * hard_acc / hard, 2) if hard > 0 else 0.0
s_acc = round(100 * short_acc / short, 2) if short > 0 else 0.0
m_acc = round(100 * medium_acc / medium, 2) if medium > 0 else 0.0
l_acc = round(100 * long_acc / long, 2) if long > 0 else 0.0

# Additional by-domain stats (extra vs. stx)
domain_stats = {}
for response in responses:
domain = response.get("domain", "Unknown")
for r in responses:
domain = r.get("domain", "Unknown")
if domain not in domain_stats:
domain_stats[domain] = {"total": 0, "correct": 0}
domain_stats[domain]["total"] += 1
if response.get("judge", False):
if r.get("judge", False):
domain_stats[domain]["correct"] += 1

domain_acc = {
domain: round(100 * stats["correct"] / stats["total"], 1)
domain: round(100 * stats["correct"] / stats["total"], 2)
for domain, stats in domain_stats.items()
}

return {
"overall": overall_acc,
"easy": easy_acc,
"hard": hard_acc,
"short": short_acc,
"medium": medium_acc,
"long": long_acc,
"overall": o_acc,
"easy": e_acc,
"hard": h_acc,
"short": s_acc,
"medium": m_acc,
"long": l_acc,
"by_domain": domain_acc,
"total_samples": total,
"correct_samples": correct,
"correct_samples": easy_acc + hard_acc,
"total_prompt_tokens": total_prompt_tokens,
"avg_prompt_tokens": round(total_prompt_tokens / total, 2) if total > 0 else 0.0,
}
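For reference, a minimal sketch (not part of this PR's diff) of exercising the rewritten calculate_accuracy on a hand-made responses list. The dict keys (judge, difficulty, length, domain, prompt_tokens) mirror what the function reads; the import path and the domain labels are illustrative assumptions. Note the counters use `isinstance(pt, int | float)`, so Python 3.10+ is assumed.

```python
# Hypothetical import path; adjust to wherever longbench_v2_metric.py sits in your checkout.
from longbench_v2_metric import calculate_accuracy

responses = [
    # Keys mirror what calculate_accuracy reads; values are made up for illustration.
    {"judge": True,  "difficulty": "easy", "length": "short",  "domain": "Single-Document QA", "prompt_tokens": 1200},
    {"judge": False, "difficulty": "hard", "length": "medium", "domain": "Single-Document QA", "prompt_tokens": 4800},
    {"judge": True,  "difficulty": "hard", "length": "long",   "domain": "Code Repo Understanding", "prompt_tokens": 9100},
]

metrics = calculate_accuracy(responses)
print(metrics["overall"], metrics["easy"], metrics["hard"])  # 66.67 100.0 50.0
print(metrics["by_domain"])           # per-domain accuracy, rounded to 2 decimals
print(metrics["avg_prompt_tokens"])   # (1200 + 4800 + 9100) / 3 = 5033.33
```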


@@ -92,11 +97,36 @@ def main(frame, version="default"):
with open(responses_path, encoding="utf-8") as f:
responses = json.load(f)

# Only keep entries with non-empty context (search_context) to align with response generation
filtered = [r for r in responses if str(r.get("search_context", "")).strip() != ""]

# Calculate metrics
metrics = calculate_accuracy(filtered)
# Only keep entries that actually have search results:
# - For new pipeline: non-empty memories_used list
# - For older runs: non-empty search_context string
def _has_search_results(r: dict) -> bool:
mems = r.get("memories_used")
if isinstance(mems, list) and any(str(m).strip() for m in mems):
return True
ctx = str(r.get("search_context", "")).strip()
return ctx != ""

filtered = [r for r in responses if _has_search_results(r)]

# Calculate metrics (handle case where no samples have search results)
if not filtered:
print("⚠️ No responses with valid search results were found. Metrics will be zeroed.")
metrics = {
"overall": 0.0,
"easy": 0.0,
"hard": 0.0,
"short": 0.0,
"medium": 0.0,
"long": 0.0,
"by_domain": {},
"total_samples": 0,
"correct_samples": 0,
"total_prompt_tokens": 0,
"avg_prompt_tokens": 0.0,
}
else:
metrics = calculate_accuracy(filtered)

# Save metrics
output_path = f"results/long_bench_v2/{frame}-{version}/{frame}_longbench_v2_metrics.json"
Expand All @@ -112,12 +142,13 @@ def main(frame, version="default"):
# Print summary table
print("\n📊 Summary of Results:")
print("-" * 80)
print(f"{'Overall Accuracy':<30s}: {metrics['overall']:.1f}%")
print(f"{'Easy':<30s}: {metrics['easy']:.1f}%")
print(f"{'Hard':<30s}: {metrics['hard']:.1f}%")
print(f"{'Short':<30s}: {metrics['short']:.1f}%")
print(f"{'Medium':<30s}: {metrics['medium']:.1f}%")
print(f"{'Long':<30s}: {metrics['long']:.1f}%")
print(f"{'Overall Accuracy':<30s}: {metrics['overall']:.2f}%")
print(f"{'Easy':<30s}: {metrics['easy']:.2f}%")
print(f"{'Hard':<30s}: {metrics['hard']:.2f}%")
print(f"{'Short':<30s}: {metrics['short']:.2f}%")
print(f"{'Medium':<30s}: {metrics['medium']:.2f}%")
print(f"{'Long':<30s}: {metrics['long']:.2f}%")
print(f"{'Avg Prompt Tokens':<30s}: {metrics.get('avg_prompt_tokens', 0.0):.2f}")
print("\nBy Domain:")
for domain, acc in metrics["by_domain"].items():
print(f" {domain:<28s}: {acc:.1f}%")
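As a usage note on the new filtering step in main(), here is a small self-contained sketch applying the same rule as the added _has_search_results helper; the sample entries are invented for illustration and show that both the new memories_used list and the older search_context string count as valid search results.

```python
def _has_search_results(r: dict) -> bool:
    # Same logic as the helper added in main(): prefer memories_used (new pipeline),
    # fall back to search_context (older runs).
    mems = r.get("memories_used")
    if isinstance(mems, list) and any(str(m).strip() for m in mems):
        return True
    return str(r.get("search_context", "")).strip() != ""

responses = [
    {"judge": True, "memories_used": ["mem-1", "mem-2"]},          # new pipeline -> kept
    {"judge": True, "search_context": "retrieved passage"},         # older run -> kept
    {"judge": True, "memories_used": [], "search_context": ""},     # no results -> dropped
]

filtered = [r for r in responses if _has_search_results(r)]
assert len(filtered) == 2  # empty-result entries are excluded before scoring
```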