#!/usr/bin/env python3
"""
Complete Bench CLI - Full Benchmark System
Features:
1. run - Run benchmark tests
2. generate - Generate new benchmark candidate data
3. validate - Validate the quality of generated data
4. promote - Promote candidate to official benchmark
5. list-results - List test results
6. show-result - Show result details
7. compare - Compare two results
8. info - Show benchmark info
"""
import argparse
import json
import logging
import sys
from pathlib import Path
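# Make the local bench package importable when this script is run directly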
sys.path.insert(0, str(Path(__file__).parent))
from bench.core.simple_manager import Benchmark, ResultsManager
from bench.core.simple_runner import SimpleTestRunner
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def cmd_run(args):
    """Run benchmark tests"""
    schema_filter = None
    if args.schema_filter:
        schema_filter = [s.strip() for s in args.schema_filter.split(',')]
    schema_indices = None
    if args.schema_indices:
        schema_indices = [int(i.strip()) for i in args.schema_indices.split(',')]
    runner = SimpleTestRunner(
        mode=args.mode,
        filter_expr=args.filter,
        schema_filter=schema_filter,
        schema_indices=schema_indices,
        timeout=args.timeout,
    )
    try:
        runner.run(
            result_id=args.output_id,
            verbose=args.verbose
        )
        return 0
    except Exception as e:
        logger.error(f"❌ Test failed: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1
def cmd_generate(args):
    """Generate new benchmark candidate data"""
    print("=" * 80)
    print("🔄 Generate Benchmark Candidate")
    print("=" * 80)
    print()
    import subprocess
    from datetime import datetime
    gen_id = args.output_id or datetime.now().strftime("%Y%m%d_%H%M%S")
    # Prepare output directory
    if args.use_generation_dir:
        output_dir = Path(f'bench/data/generation/{gen_id}')
        output_dir.mkdir(parents=True, exist_ok=True)
        print(f"📂 Output Directory: {output_dir}")
    else:
        output_dir = None
        print("📂 Output to default location (bench/data/raw/)")
    print()
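    # Note: output_dir is not passed to generate.py below, so the script writes
    # to its default location (bench/data/raw/); that is also where the
    # post-run "next steps" lookup below searches for the newest generation.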
    # Run the generation script
    cmd = [sys.executable, 'bench/generate/generate.py']
    if args.config:
        cmd.extend(['--config', args.config])
    print("🏃 Running generation script...")
    print(f"   Command: {' '.join(cmd)}")
    print()
    result = subprocess.run(cmd, cwd=Path.cwd())
    if result.returncode == 0:
        print()
        print("✅ Generation complete!")
        print()
        # Find the most recent generation directory
        raw_dir = Path('bench/data/raw')
        if raw_dir.exists():
            gen_dirs = sorted([d for d in raw_dir.iterdir() if d.is_dir()],
                              key=lambda x: x.name, reverse=True)
            if gen_dirs:
                latest_gen = gen_dirs[0].name
                print("Next steps:")
                print(f"  1. Validate quality: ./bench-cli validate {latest_gen}")
                print(f"  2. Promote to benchmark: ./bench-cli promote {latest_gen}")
    else:
        print()
        print("❌ Generation failed")
    return result.returncode
def cmd_validate(args):
    """Validate generated data quality"""
    gen_id = args.generation_id
    # Locate generated data
    possible_paths = [
        Path(f'bench/data/generation/{gen_id}'),
        Path(f'bench/data/raw/{gen_id}'),
    ]
    gen_dir = None
    for p in possible_paths:
        if p.exists():
            gen_dir = p
            break
    if not gen_dir:
        logger.error(f"Generation not found: {gen_id}")
        logger.error(f"Searched in: {', '.join(str(p) for p in possible_paths)}")
        return 1
    print("=" * 80)
    print(f"🔍 Validate Data: {gen_id}")
    print("=" * 80)
    print(f"📂 Location: {gen_dir}")
    print()
    # Find stage3.jsonl
    stage3_file = gen_dir / 'stage3.jsonl'
    if not stage3_file.exists():
        logger.error(f"stage3.jsonl not found in {gen_dir}")
        return 1
    # Count samples, skipping blank or malformed lines
    samples = []
    with open(stage3_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                try:
                    samples.append(json.loads(line))
                except json.JSONDecodeError:
                    pass
    print(f"📊 Total samples: {len(samples)}")
    print()
    # Quick statistics
    from collections import Counter
    lang_counter = Counter()
    op_counter = Counter()
    for sample in samples:
        class_info = sample.get('class', {})
        lang_counter[class_info.get('lang', 'unknown')] += 1
        for schema in sample.get('schema_list', []):
            op_counter[schema.get('op', 'unknown')] += 1
    print("📈 Data distribution:")
    print(f"   Languages: {dict(lang_counter)}")
    print(f"   Operations (Top 5): {dict(op_counter.most_common(5))}")
    print()
    # Run test validation
    if args.run_tests:
        print("🧪 Running test validation...")
        print()
        import tempfile
        import shutil
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create a temporary benchmark from the candidate data
            tmp_benchmark_dir = Path(tmpdir) / 'benchmark'
            tmp_benchmark_dir.mkdir()
            shutil.copy(stage3_file, tmp_benchmark_dir / 'benchmark.jsonl')
            metadata = {
                'total_samples': len(samples),
                'created_at': 'validation',
            }
            with open(tmp_benchmark_dir / 'metadata.json', 'w') as f:
                json.dump(metadata, f)
            # Point the runner's benchmark at the temporary copy, remembering
            # the original paths so they can all be restored afterwards
            runner = SimpleTestRunner(mode='mock')
            original_dir = runner.benchmark.benchmark_dir
            original_file = runner.benchmark.benchmark_file
            original_metadata = runner.benchmark.metadata_file
            runner.benchmark.benchmark_dir = tmp_benchmark_dir
            runner.benchmark.benchmark_file = tmp_benchmark_dir / 'benchmark.jsonl'
            runner.benchmark.metadata_file = tmp_benchmark_dir / 'metadata.json'
            try:
                result = runner.run(verbose=args.verbose)
                pass_rate = result.report['summary']['pass_rate']
                print()
                print(f"📊 Pass rate: {pass_rate*100:.1f}%")
                print()
                if pass_rate >= 0.5:
                    print("💡 Quality evaluation: Good — ready for promotion")
                    print()
                    print("Next step:")
                    print(f"  ./bench-cli promote {gen_id}")
                else:
                    print("⚠️ Quality evaluation: Low pass rate — consider regenerating")
            finally:
                # Restore every redirected path, not just the directory
                runner.benchmark.benchmark_dir = original_dir
                runner.benchmark.benchmark_file = original_file
                runner.benchmark.metadata_file = original_metadata
    else:
        print("💡 Tip: Add --run-tests to perform validation")
        print()
        print("Next steps:")
        print(f"  ./bench-cli validate {gen_id} --run-tests   # Run tests")
        print(f"  ./bench-cli promote {gen_id}                # Promote to benchmark")
    return 0
def cmd_promote(args):
    """Promote generated data to official benchmark"""
    gen_id = args.generation_id
    # Locate generated data
    possible_paths = [
        Path(f'bench/data/generation/{gen_id}'),
        Path(f'bench/data/raw/{gen_id}'),
    ]
    gen_dir = None
    for p in possible_paths:
        if p.exists():
            gen_dir = p
            break
    if not gen_dir:
        logger.error(f"Generation not found: {gen_id}")
        return 1
    stage3_file = gen_dir / 'stage3.jsonl'
    if not stage3_file.exists():
        logger.error(f"stage3.jsonl not found in {gen_dir}")
        return 1
    print("=" * 80)
    print(f"⬆️ Promote to Official Benchmark: {gen_id}")
    print("=" * 80)
    print()
    # Confirmation
    if not args.yes:
        print("⚠️ This will replace the current benchmark!")
        try:
            benchmark = Benchmark()
            print(f"   Current benchmark: {benchmark.sample_count} samples")
        except Exception:
            print("   Current benchmark: Not found")
        with open(stage3_file, 'r') as f:
            new_count = sum(1 for line in f if line.strip())
        print(f"   New benchmark: {new_count} samples")
        print()
        confirm = input("Confirm replacement? (yes/no): ")
        if confirm.lower() != 'yes':
            print("❎ Cancelled")
            return 0
    # Backup current benchmark
    import shutil
    from datetime import datetime
    benchmark_dir = Path('bench/data/benchmark')
    backup_dir = Path(f'bench/data/archive/benchmark_backup_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
    backup_dir.parent.mkdir(parents=True, exist_ok=True)
    if benchmark_dir.exists():
        shutil.copytree(benchmark_dir, backup_dir)
        print(f"✓ Backed up current benchmark -> {backup_dir}")
    print()
    # Read and filter samples
    print("🔄 Processing data...")
    samples = []
    with open(stage3_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                try:
                    samples.append(json.loads(line))
                except json.JSONDecodeError:
                    pass
    print(f"   Original samples: {len(samples)}")
    # Filter invalid data
    ALLOWED_OPERATIONS = {
        'Encode', 'Retrieve', 'Update', 'Delete', 'Summarize', 'Label',
        'Promote', 'Demote', 'Expire', 'Lock', 'Merge', 'Split',
    }
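    # Keep a sample only if it has a non-empty schema_list, contains no
    # 'unknown' value anywhere in its serialized JSON, and uses only the
    # operations whitelisted above.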
    filtered_samples = []
    for sample in samples:
        schema_list = sample.get('schema_list', [])
        if not schema_list:
            continue
        sample_str = json.dumps(sample)
        if 'unknown' in sample_str.lower():
            continue
        valid = True
        for schema in schema_list:
            if schema.get('op') not in ALLOWED_OPERATIONS:
                valid = False
                break
        if valid:
            filtered_samples.append(sample)
    print(f"   Filtered: {len(filtered_samples)}")
    print()
    # Save to benchmark directory
    benchmark_dir.mkdir(parents=True, exist_ok=True)
    with open(benchmark_dir / 'benchmark.jsonl', 'w', encoding='utf-8') as f:
        for sample in filtered_samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    # Generate statistics
    from collections import Counter
    lang_counter = Counter()
    op_counter = Counter()
    structure_counter = Counter()
    for sample in filtered_samples:
        class_info = sample.get('class', {})
        lang_counter[class_info.get('lang', 'unknown')] += 1
        structure_counter[class_info.get('structure', 'unknown')] += 1
        for schema in sample.get('schema_list', []):
            op_counter[schema.get('op', 'unknown')] += 1
    stats = {
        'total': len(filtered_samples),
        'distribution': {
            'languages': dict(lang_counter),
            'operations': dict(op_counter),
            'structures': dict(structure_counter),
        }
    }
    with open(benchmark_dir / 'stats.json', 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    # Save metadata
    metadata = {
        'total_samples': len(filtered_samples),
        'created_at': datetime.now().isoformat(),
        'last_updated': datetime.now().isoformat(),
        'source': f'{gen_dir.parent.name}/{gen_id}',
        'notes': args.notes or f'Promoted from {gen_id}'
    }
    with open(benchmark_dir / 'metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    print("=" * 80)
    print("✅ Benchmark updated!")
    print("=" * 80)
    print()
    print("📊 New Benchmark Info:")
    print(f"   Total samples: {len(filtered_samples)}")
    print(f"   Language distribution: {dict(lang_counter)}")
    print(f"   Operation distribution (Top 5): {dict(op_counter.most_common(5))}")
    print()
    print("Next step:")
    print("  ./bench-cli run --mode ollama -v   # Run full test")
    print()
    return 0
def cmd_list_results(args):
    """List benchmark test results"""
    manager = ResultsManager()
    results = manager.list_results(limit=args.limit)
    if not results:
        print("ℹ️ No test results found")
        return 0
    from datetime import datetime
    print()
    print("=" * 100)
    print(f"{'ID':<20} {'Mode':<10} {'Pass Rate':<12} {'Duration':<12} {'Timestamp':<20}")
    print("=" * 100)
    for result in results:
        config = result.config
        report = result.report
        mode = config.get('test_config', {}).get('mode', 'unknown')
        summary = report.get('summary', {})
        pass_rate = summary.get('pass_rate', 0.0)
        duration = summary.get('duration_seconds', 0.0)
        timestamp = config.get('timestamp', '')
        if timestamp and 'T' in timestamp:
            try:
                dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                timestamp = dt.strftime('%Y-%m-%d %H:%M')
            except ValueError:
                timestamp = timestamp[:16]
        # Format rate/duration as strings first so the columns line up with the header
        print(f"{result.result_id:<20} {mode:<10} {f'{pass_rate*100:.1f}%':<12} "
              f"{f'{duration:.1f}s':<12} {timestamp:<20}")
    print()
    return 0
def cmd_show_result(args):
    """Show benchmark result details"""
    manager = ResultsManager()
    try:
        result = manager.get_result(args.result_id)
    except FileNotFoundError as e:
        logger.error(str(e))
        return 1
    config = result.config
    report = result.report
    print()
    print("=" * 80)
    print(f"📊 Test Result: {result.result_id}")
    print("=" * 80)
    print()
    test_config = config.get('test_config', {})
    print("⚙️ Configuration:")
    print(f"   Mode: {test_config.get('mode', 'unknown')}")
    print(f"   Samples: {config.get('benchmark_samples', 0)}")
    filters = test_config.get('filters', {})
    if filters.get('filter_expr'):
        print(f"   Filter: {filters['filter_expr']}")
    if filters.get('schema_filter'):
        print(f"   Schema filter: {', '.join(filters['schema_filter'])}")
    print()
    summary = report.get('summary', {})
    print("📈 Summary:")
    print(f"   Total: {summary.get('total', 0)}")
    print(f"   Passed: {summary.get('passed', 0)}")
    print(f"   Failed: {summary.get('failed', 0)}")
    print(f"   Pass rate: {summary.get('pass_rate', 0)*100:.1f}%")
    print(f"   Duration: {summary.get('duration_seconds', 0):.1f}s")
    print()
    by_op = report.get('by_operation', {})
    if by_op:
        print("📋 By Operation:")
        sorted_ops = sorted(by_op.items(), key=lambda x: x[1]['total'], reverse=True)
        for op, stats in sorted_ops[:10]:
            print(f"   {op:<12} {stats['passed']:>4}/{stats['total']:<4} ({stats['pass_rate']*100:>5.1f}%)")
        print()
    by_lang = report.get('by_language', {})
    if by_lang:
        print("🌐 By Language:")
        for lang, stats in sorted(by_lang.items()):
            print(f"   {lang:<6} {stats['passed']:>4}/{stats['total']:<4} ({stats['pass_rate']*100:>5.1f}%)")
        print()
    if args.show_failed and result.failed_file.exists():
        print("❌ Failed Samples:")
        with open(result.failed_file, 'r', encoding='utf-8') as f:
            failed = [json.loads(line) for line in f if line.strip()]
        for item in failed[:10]:
            sample_id = item.get('sample_id', 'unknown')
            errors = item.get('errors', [])
            print(f"   {sample_id}")
            if errors:
                print(f"      {errors[0][:80]}...")
        if len(failed) > 10:
            print(f"   ... and {len(failed) - 10} more")
        print()
    print(f"📂 Result Directory: bench/data/results/{result.result_id}/")
    print()
    return 0
def cmd_compare(args):
    """Compare two benchmark test results"""
    manager = ResultsManager()
    try:
        result1 = manager.get_result(args.result_id1)
        result2 = manager.get_result(args.result_id2)
    except FileNotFoundError as e:
        logger.error(str(e))
        return 1
    report1 = result1.report
    report2 = result2.report
    summary1 = report1.get('summary', {})
    summary2 = report2.get('summary', {})
    print()
    print("=" * 80)
    print("📊 Result Comparison")
    print("=" * 80)
    print()
    print(f"Left:  {result1.result_id}")
    print(f"Right: {result2.result_id}")
    print()
    print("📈 Summary:")
    print(f"{'Metric':<20} {'Left':<15} {'Right':<15} {'Change':<15}")
    print("-" * 65)
    total1 = summary1.get('total', 0)
    total2 = summary2.get('total', 0)
    print(f"{'Total':<20} {total1:<15} {total2:<15} {total2 - total1:+}")
    passed1 = summary1.get('passed', 0)
    passed2 = summary2.get('passed', 0)
    print(f"{'Passed':<20} {passed1:<15} {passed2:<15} {passed2 - passed1:+}")
    rate1 = summary1.get('pass_rate', 0)
    rate2 = summary2.get('pass_rate', 0)
    # Pre-format percentage/duration strings so padding matches the header widths
    print(f"{'Pass Rate':<20} {f'{rate1*100:.1f}%':<15} {f'{rate2*100:.1f}%':<15} {(rate2 - rate1)*100:+.1f}%")
    dur1 = summary1.get('duration_seconds', 0)
    dur2 = summary2.get('duration_seconds', 0)
    print(f"{'Duration (s)':<20} {f'{dur1:.1f}':<15} {f'{dur2:.1f}':<15} {dur2 - dur1:+.1f}")
    print()
    return 0
def cmd_info(args):
    """Show information about the current benchmark"""
    try:
        benchmark = Benchmark()
        print(benchmark.info())
        return 0
    except FileNotFoundError as e:
        logger.error(str(e))
        logger.error("Benchmark not found; generate or import a benchmark first")
        return 1
def main():
    parser = argparse.ArgumentParser(
        description='Text2Mem Benchmark CLI - Full Benchmark System',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest='command', help='Available Commands')
    # --- run ---
    run_parser = subparsers.add_parser('run', help='Run benchmark tests')
    run_parser.add_argument('--mode', default='auto',
                            choices=['auto', 'mock', 'ollama', 'openai'],
                            help='Test mode')
    run_parser.add_argument('--filter', help='Sample filter, e.g. "lang:zh"')
    run_parser.add_argument('--schema-filter', help='Schema filter, e.g. "Encode,Retrieve"')
    run_parser.add_argument('--schema-indices', help='Schema indices, e.g. "0,2"')
    run_parser.add_argument('--timeout', type=float, help='Timeout in seconds')
    run_parser.add_argument('--output-id', help='Result ID (default: timestamp)')
    run_parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    # --- generate ---
    gen_parser = subparsers.add_parser('generate', help='Generate new benchmark candidate data')
    gen_parser.add_argument('--config', help='Path to config file')
    gen_parser.add_argument('--output-id', help='Output ID (default: timestamp)')
    gen_parser.add_argument('--use-generation-dir', action='store_true',
                            help='Use generation/ directory instead of raw/')
    # --- validate ---
    val_parser = subparsers.add_parser('validate', help='Validate generated data quality')
    val_parser.add_argument('generation_id', help='Generation ID')
    val_parser.add_argument('--run-tests', action='store_true', help='Run validation tests')
    val_parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    # --- promote ---
    pro_parser = subparsers.add_parser('promote', help='Promote candidate to official benchmark')
    pro_parser.add_argument('generation_id', help='Generation ID')
    pro_parser.add_argument('--yes', '-y', action='store_true', help='Skip confirmation')
    pro_parser.add_argument('--notes', help='Notes')
    # --- list-results ---
    list_parser = subparsers.add_parser('list-results', help='List benchmark test results')
    list_parser.add_argument('--limit', type=int, default=20, help='Limit number of results')
    # --- show-result ---
    show_parser = subparsers.add_parser('show-result', help='Show result details')
    show_parser.add_argument('result_id', help='Result ID (or "latest")')
    show_parser.add_argument('--show-failed', action='store_true', help='Show failed samples')
    # --- compare ---
    cmp_parser = subparsers.add_parser('compare', help='Compare two results')
    cmp_parser.add_argument('result_id1', help='Result ID 1')
    cmp_parser.add_argument('result_id2', help='Result ID 2')
    # --- info ---
    subparsers.add_parser('info', help='Show benchmark information')
    # Parse CLI arguments
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        return 1
    commands = {
        'run': cmd_run,
        'generate': cmd_generate,
        'validate': cmd_validate,
        'promote': cmd_promote,
        'list-results': cmd_list_results,
        'show-result': cmd_show_result,
        'compare': cmd_compare,
        'info': cmd_info,
    }
    # Execute selected command
    return commands[args.command](args)
if __name__ == '__main__':
    sys.exit(main())