Skip to content

Commit 21f062c

Browse files
test: pytest suite + GitHub Actions CI
Covers every Phase 5 addition plus core sanitize passes:

- test_progress.py — Progress dataclass, human/jsonl/noop sinks, emit() swallowing broken callbacks, CLI flag resolution including unknown-value fallback
- test_results_schema.py — minimal/full valid docs; rejects missing metrics, missing metric.value, bad table id pattern, disallowed owning_section value, extraneous top-level keys. Roundtrip check that owning_section survives from_dict
- test_dedup_tables.py — all three dedup signals (ownership, label, fingerprint), no-op on unique tables, demote comment still carries \ref{tab:...}
- test_sanitize.py — smoke test per pass (cjk, fences, markdown, reasoning) and the full pipeline
- test_cli.py — build_parser covers every subcommand, the --progress flag is wired on every subparser, validate-results exits 0/1/2 correctly via subprocess. Also pins the coherence default (regression guard for the bug just fixed)

pyproject adds a [dev] extra (pytest>=7.4) and [tool.pytest.ini_options] that points at tests/. CI runs pytest across Python 3.10/3.11/3.12 on every push/PR to main; job fail-fast is off so all versions report independently.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d1e25e1 commit 21f062c

File tree

8 files changed

+456
-0
lines changed

8 files changed

+456
-0
lines changed

.github/workflows/ci.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Continuous integration: run the pytest suite on pushes and pull requests to main.
name: ci

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# The job only checks out the repository and runs tests, so a read-only
# token is sufficient.
permissions:
  contents: read

jobs:
  test:
    name: pytest (py${{ matrix.python-version }})
    runs-on: ubuntu-latest
    # Bound the job so a hung test cannot hold a runner until the platform limit.
    timeout-minutes: 15
    strategy:
      # Let every Python version report independently instead of cancelling
      # the remaining matrix entries on the first failure.
      fail-fast: false
      matrix:
        # Quoted so YAML does not read 3.10 as the number 3.1.
        python-version: ["3.10", "3.11", "3.12"]
    defaults:
      run:
        # Applies to `run:` steps only; `uses:` steps are unaffected.
        working-directory: skills/hermes-sci/package
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip

      - name: Install package + dev deps
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[dev]"

      - name: Run pytest
        run: pytest -v

skills/hermes-sci/package/pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ full = [
2929
"anthropic>=0.30",
3030
"pymupdf4llm>=0.0.10",
3131
]
32+
dev = [
33+
"pytest>=7.4",
34+
]
3235

3336
[project.scripts]
3437
hermes-sci = "hermes_sci.cli:main"
@@ -37,6 +40,11 @@ hermes-sci = "hermes_sci.cli:main"
3740
where = ["."]
3841
include = ["hermes_sci*"]
3942

43+
[tool.pytest.ini_options]
44+
testpaths = ["tests"]
45+
python_files = ["test_*.py"]
46+
addopts = "-ra -q"
47+
4048
[tool.setuptools.package-data]
4149
hermes_sci = [
4250
"latex/*.tex.j2", "latex/*.sty", "latex/*.bib",

skills/hermes-sci/package/tests/__init__.py

Whitespace-only changes.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""CLI argparse + validate-results end-to-end (no network)."""
2+
from __future__ import annotations
3+
4+
import json
5+
import pathlib
6+
import subprocess
7+
import sys
8+
9+
import pytest
10+
11+
from hermes_sci.cli import build_parser
12+
13+
# Directory two levels above this file; the subprocess-based tests use it as cwd.
PKG_ROOT = pathlib.Path(__file__).resolve().parents[1]
14+
15+
16+
def test_parser_lists_all_subcommands():
    """Each subcommand parses a minimal argv and records its name in ``ns.cmd``."""
    parser = build_parser()
    # Subcommand name -> the smallest argument list this test supplies for it.
    minimal_args = {
        "ideate": ["--topic", "x", "-o", "i.json"],
        "writeup": ["--ideas-json", "i.json", "-o", "out"],
        "review": ["--paper", "p.pdf"],
        "pipeline": ["--topic", "x", "-o", "out"],
        "validate-results": ["r.json"],
    }
    for cmd, rest in minimal_args.items():
        ns = parser.parse_args([cmd, *rest])
        assert ns.cmd == cmd
28+
29+
30+
@pytest.mark.parametrize("sink", ["human", "jsonl", "off"])
@pytest.mark.parametrize(
    "argv",
    [
        ["ideate", "--topic", "x", "-o", "i.json"],
        ["writeup", "--ideas-json", "i.json", "-o", "out"],
        ["review", "--paper", "p.pdf"],
        ["pipeline", "--topic", "x", "-o", "out"],
        ["validate-results", "r.json"],
    ],
    ids=lambda argv: argv[0],
)
def test_progress_flag_accepted_everywhere(argv, sink):
    """``--progress <sink>`` parses on every subcommand, not only ``ideate``.

    ``argv`` is the minimal argument list for one subcommand; the flag is
    appended after it so it is handled by that subcommand's own parser.
    """
    ns = build_parser().parse_args([*argv, "--progress", sink])
    assert ns.progress == sink
36+
37+
38+
def test_progress_rejects_unknown_value():
    """An unrecognised ``--progress`` value aborts parsing as a usage error."""
    parser = build_parser()
    argv = ["ideate", "--topic", "x", "-o", "i.json", "--progress", "spinner"]
    with pytest.raises(SystemExit) as excinfo:
        parser.parse_args(argv)
    # argparse reports invalid arguments through parser.error(), which exits
    # with status 2; pinning the code rules out an unrelated clean exit.
    assert excinfo.value.code == 2
43+
44+
45+
def test_coherence_default_false():
    """Regression guard: ``pipeline`` keeps coherence off unless ``--coherence`` is given.

    The original note states that coherence is opt-in on every subcommand and
    that pipeline previously had a bug where it used args.no_coherence (never
    defined). Only the ``pipeline`` subcommand is exercised here.
    """
    base_argv = ["pipeline", "--topic", "x", "-o", "out"]
    parser = build_parser()

    without_flag = parser.parse_args(base_argv)
    assert without_flag.coherence is False

    with_flag = parser.parse_args(base_argv + ["--coherence"])
    assert with_flag.coherence is True
53+
54+
55+
def test_validate_results_good_exits_0(tmp_path):
    """A minimal valid results document makes ``validate-results`` exit 0."""
    doc = {"metrics": [{"name": "BLEU", "value": 28.3}]}
    results_file = tmp_path / "r.json"
    results_file.write_text(json.dumps(doc), encoding="utf-8")
    r = subprocess.run(
        [sys.executable, "-m", "hermes_sci.cli",
         "validate-results", str(results_file)],
        capture_output=True, text=True, cwd=str(PKG_ROOT),
        # Fail the test instead of stalling the whole run if the CLI hangs.
        timeout=120,
    )
    assert r.returncode == 0, r.stderr
    assert "matches results.json schema" in r.stdout
65+
66+
67+
def test_validate_results_bad_exits_1(tmp_path):
    """A document with an invalid table id makes ``validate-results`` exit 1."""
    doc = {
        "metrics": [],
        # "bad id" contains a space; the test expects the schema to reject it.
        "tables": [{"id": "bad id", "headers": ["x"], "rows": []}],
    }
    results_file = tmp_path / "bad.json"
    results_file.write_text(json.dumps(doc), encoding="utf-8")
    r = subprocess.run(
        [sys.executable, "-m", "hermes_sci.cli",
         "validate-results", str(results_file)],
        capture_output=True, text=True, cwd=str(PKG_ROOT),
        # Fail the test instead of stalling the whole run if the CLI hangs.
        timeout=120,
    )
    # Surface stderr on failure so an unexpected exit code is diagnosable.
    assert r.returncode == 1, r.stderr
    assert "schema violation" in r.stderr
78+
79+
80+
def test_validate_results_missing_file_exits_2(tmp_path):
    """Pointing ``validate-results`` at a non-existent path exits with status 2."""
    missing = tmp_path / "nope.json"  # never created
    r = subprocess.run(
        [sys.executable, "-m", "hermes_sci.cli",
         "validate-results", str(missing)],
        capture_output=True, text=True, cwd=str(PKG_ROOT),
        # Fail the test instead of stalling the whole run if the CLI hangs.
        timeout=120,
    )
    # NOTE(review): argparse usage errors also exit with status 2, so this
    # code alone cannot tell "file missing" from "bad command line" — consider
    # additionally asserting on the CLI's missing-file message once confirmed.
    assert r.returncode == 2, r.stderr
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""Cross-section table dedup via ownership + fingerprint."""
2+
from __future__ import annotations
3+
4+
from hermes_sci.sanitize.tables import dedup_tables
5+
6+
# One labelled LaTeX table fixture; tests place the same text in two sections
# to simulate both sections emitting an identical table.
_TBL = r"""
\begin{table}[h]
\centering
\caption{Results by input complexity}
\label{tab:complexity}
\begin{tabular}{|l|c|c|}
\hline
Complexity & Latency & BLEU \\
\hline
Simple & 19.4 & 28.1 \\
Medium & 28.7 & 27.6 \\
Hard & 42.1 & 26.8 \\
\hline
\end{tabular}
\end{table}
""".strip()
23+
24+
25+
def _has_begin_table(s: str) -> bool:
    """Return True when *s* opens a LaTeX table float (``table`` or ``table*``).

    ``\\begin{tabular}`` alone does not count: only the float environment does.
    """
    return r"\begin{table}" in s or r"\begin{table*}" in s
27+
28+
29+
def test_owning_section_wins():
    """With an ownership map, only the owning section keeps the table float."""
    ownership = {"tab:complexity": "experiments"}
    sections = {
        "experiments": f"Prose.\n{_TBL}\nMore.",
        "results": f"Other prose.\n{_TBL}\nDone.",
    }
    out, events = dedup_tables(sections, table_ownership=ownership)
    assert _has_begin_table(out["experiments"])
    assert not _has_begin_table(out["results"])
    # At least one event must attribute the drop to the ownership map.
    assert any(event["reason"] == "owning_section" for event in events)
40+
41+
42+
def test_duplicate_label_first_wins_when_no_ownership():
    """Without an ownership map, the first section keeps a duplicated label."""
    duplicated = {"experiments": _TBL, "results": _TBL}
    deduped, events = dedup_tables(duplicated)  # no ownership map
    assert _has_begin_table(deduped["experiments"])
    assert not _has_begin_table(deduped["results"])
    first_event = events[0]
    assert first_event["reason"] == "duplicate_label"
48+
49+
50+
def test_fingerprint_catches_unlabeled_duplicate():
    """Identical tables with the label removed are still deduplicated."""
    unlabeled = _TBL.replace(r"\label{tab:complexity}", "")
    deduped, events = dedup_tables(
        {name: unlabeled for name in ("experiments", "results")}
    )
    assert _has_begin_table(deduped["experiments"])
    assert not _has_begin_table(deduped["results"])
    first_event = events[0]
    assert first_event["reason"] == "fingerprint"
57+
58+
59+
def test_different_tables_both_survive():
    """Tables that differ in caption and label are both kept, with no events."""
    substitutions = (
        ("Results by input complexity", "Ablation over dropout rate"),
        ("tab:complexity", "tab:ablation"),
    )
    other = _TBL
    for old, new in substitutions:
        other = other.replace(old, new)

    deduped, events = dedup_tables({"experiments": _TBL, "results": other})
    for section in ("experiments", "results"):
        assert _has_begin_table(deduped[section])
    assert events == []
68+
69+
70+
def test_no_tables_is_noop():
    """Sections without any table come back unchanged and produce no events."""
    prose_only = {"method": "Prose only.", "experiments": "More prose."}
    deduped, events = dedup_tables(prose_only)
    assert deduped == prose_only
    assert events == []
75+
76+
77+
def test_demotion_leaves_ref_resolvable_comment():
    """The demoted section keeps a comment that still mentions the table's ``\\ref``."""
    out, _ = dedup_tables(
        {"experiments": _TBL, "results": _TBL},
        table_ownership={"tab:complexity": "experiments"},
    )
    demoted = out["results"]
    # Dropped block is replaced by a LaTeX comment citing the label so a
    # nearby \ref{tab:complexity} still makes sense in the prose.
    assert r"\ref{tab:complexity}" in demoted
    starts_as_comment = demoted.lstrip().startswith("%")
    assert starts_as_comment or "% (duplicate" in demoted
87+
88+
89+
def test_owning_section_does_not_demote_first_hit():
    """A table that appears once, in its owning section, is kept with no events."""
    ownership = {"tab:complexity": "experiments"}
    deduped, events = dedup_tables(
        {"experiments": _TBL}, table_ownership=ownership
    )
    assert _has_begin_table(deduped["experiments"])
    assert events == []
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""Progress callback: sinks, safe dispatch, CLI flag resolution."""
2+
from __future__ import annotations
3+
4+
import io
5+
import json
6+
7+
import pytest
8+
9+
from hermes_sci.progress import (
10+
Progress,
11+
_resolve_builtin,
12+
emit,
13+
human,
14+
jsonl,
15+
noop,
16+
)
17+
18+
19+
def test_progress_defaults():
    """Fields not passed to ``Progress`` are empty/zero and ``ts`` is positive."""
    event = Progress(kind="stage_start", stage="ideate")
    assert event.message == ""
    assert (event.current, event.total) == (0, 0)
    assert event.meta == {}
    assert event.ts > 0
25+
26+
27+
def test_noop_sink_is_silent(capsys):
    """The no-op sink writes nothing to either standard stream."""
    noop(Progress(kind="stage_start", stage="ideate"))
    captured = capsys.readouterr()
    assert captured.err == ""
    # stdout is checked as well so that "silent" covers both streams.
    assert captured.out == ""
30+
31+
32+
def test_human_sink_writes_to_provided_fd():
    """``human`` renders start, item and end events into the stream passed as ``fd``."""
    buf = io.StringIO()
    events = (
        Progress(kind="stage_start", stage="ideate", message="topic"),
        Progress(kind="item", stage="section", current=2, total=5,
                 message="method"),
        Progress(kind="stage_end", stage="verify", message="6/8",
                 meta={"duration_s": 12.4}),
    )
    for event in events:
        human(event, fd=buf)

    rendered = buf.getvalue()
    expected_fragments = (
        "→ ideate: topic",
        "[2/5]",
        "method",
        "✓ verify",
        "(12.4s)",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
43+
44+
45+
def test_human_sink_handles_unknown_kind():
    """An unrecognised ``kind`` is rendered rather than raising."""
    stream = io.StringIO()
    # The type says Literal, but runtime unknowns shouldn't crash.
    odd_event = Progress(kind="wat", stage="ideate", message="x")  # type: ignore[arg-type]
    human(odd_event, fd=stream)
    assert "wat" in stream.getvalue()
50+
51+
52+
def test_jsonl_is_parseable():
    """``jsonl`` output parses as JSON and carries the event's fields."""
    stream = io.StringIO()
    event = Progress(kind="item", stage="section", current=1, total=3,
                     message="intro", meta={"model": "m1"})
    jsonl(event, fd=stream)

    obj = json.loads(stream.getvalue().strip())
    expected = {
        "kind": "item",
        "stage": "section",
        "current": 1,
        "total": 3,
        "meta": {"model": "m1"},
    }
    for key, value in expected.items():
        assert obj[key] == value
62+
63+
64+
def test_emit_swallows_callback_errors():
    """``emit`` returns normally even when the callback raises."""

    def broken_sink(p):
        raise RuntimeError("sink broke")

    event = Progress(kind="stage_start", stage="ideate")
    # Must not raise — a broken sink cannot crash the pipeline.
    emit(broken_sink, event)
69+
70+
71+
@pytest.mark.parametrize(
    ("name", "expected"),
    [
        ("human", human),
        ("jsonl", jsonl),
        ("off", noop),
        ("none", noop),
        ("garbage", human),  # unknowns fall through to human
    ],
)
def test_resolve_builtin(name, expected):
    """Each sink name resolves to the expected built-in sink function."""
    resolved = _resolve_builtin(name)
    assert resolved is expected

0 commit comments

Comments
 (0)