Skip to content

Commit 6e118e7

Browse files
committed
fixed metrics and output metric file names
1 parent 27e3bee commit 6e118e7

File tree

7 files changed

+193
-45
lines changed

7 files changed

+193
-45
lines changed

README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,19 +152,20 @@ Outputs:
152152
- each wrapper invocation creates a run-specific subdirectory: `run_metrics/<RUN_ID>/`
153153
- example: `run_metrics/20260225T120434/`
154154
- `run_metrics/<RUN_ID>/metrics.csv` includes both conversion and compression metrics for that run
155+
- in `--rdf-layout batch`, compression metrics are aggregated across all part files so the CSV stays at one row per sample
155156
- compound-compression fields are explicit and separate from raw-RDF compression:
156157
- `gzip_on_hdt_*` (gzip applied to `.hdt`)
157158
- `brotli_on_hdt_*` (brotli applied to `.hdt`)
158159
- `hdt_source` (`generated` vs `existing` when reused)
159160
- conversion step artifacts:
160-
- `run_metrics/<RUN_ID>/conversion-time-<output_name>-<run_id>.txt`
161-
- `run_metrics/<RUN_ID>/conversion-metrics-<output_name>-<run_id>.json`
161+
- `run_metrics/<RUN_ID>/conversion_time/<output_name>/<RUN_ID>`
162+
- `run_metrics/<RUN_ID>/conversion_metrics/<output_name>/<RUN_ID>`
162163
- compression step artifacts:
163-
- `run_metrics/<RUN_ID>/compression-time-<method>-<output_name>-<run_id>.txt`
164-
- `run_metrics/<RUN_ID>/compression-metrics-<output_name>-<run_id>.json`
164+
- `run_metrics/<RUN_ID>/compression_time/<method>/<output_name>/<RUN_ID>`
165+
- `run_metrics/<RUN_ID>/compression_metrics/<output_name>/<RUN_ID>`
165166
- wrapper runtime artifacts:
166167
- `run_metrics/<RUN_ID>/wrapper_execution_times.csv` (one row for that run with mode, elapsed time, status, and full-mode triple totals when available)
167-
- `run_metrics/<RUN_ID>/.wrapper_logs/wrapper-<run_id>.log` stores detailed Docker/stdout/stderr command output
168+
- `run_metrics/<RUN_ID>/wrapper_logs/<RUN_ID>` stores detailed Docker/stdout/stderr command output
168169

169170
Small VCF fixtures for RDF size/inflation test runs:
170171
- `test/test_vcf_files/test-100.vcf` (100 total lines)

src/compression.sh

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ if [[ ! -f "$METRICS_CSV" ]]; then
184184
else
185185
EXISTING_HEADER=$(head -n 1 "$METRICS_CSV")
186186
if [[ "$EXISTING_HEADER" != "$METRICS_HEADER" ]]; then
187-
BACKUP="$METRICS_CSV.bak-$RUN_ID"
187+
BACKUP="$LOGDIR/metrics_csv_bak_${RUN_ID}"
188188
cp "$METRICS_CSV" "$BACKUP"
189189
echo "WARNING: metrics header mismatch; backed up to $BACKUP and creating new metrics file." >&2
190190
echo "$METRICS_HEADER" > "$METRICS_CSV"
@@ -208,10 +208,15 @@ for OUT in "${OUTPUT_DIRS[@]}"; do
208208
SAFE_BASENAME="rdf"
209209
fi
210210

211-
TIME_LOG_GZIP="$LOGDIR/compression-time-gzip-${SAFE_BASENAME}-${RUN_ID}.txt"
212-
TIME_LOG_BROTLI="$LOGDIR/compression-time-brotli-${SAFE_BASENAME}-${RUN_ID}.txt"
213-
TIME_LOG_HDT="$LOGDIR/compression-time-hdt-${SAFE_BASENAME}-${RUN_ID}.txt"
214-
METRICS_JSON="$LOGDIR/compression-metrics-${SAFE_BASENAME}-${RUN_ID}.json"
211+
TIME_LOG_GZIP_DIR="$LOGDIR/compression_time/gzip/${SAFE_BASENAME}"
212+
TIME_LOG_BROTLI_DIR="$LOGDIR/compression_time/brotli/${SAFE_BASENAME}"
213+
TIME_LOG_HDT_DIR="$LOGDIR/compression_time/hdt/${SAFE_BASENAME}"
214+
METRICS_JSON_DIR="$LOGDIR/compression_metrics/${SAFE_BASENAME}"
215+
mkdir -p "$TIME_LOG_GZIP_DIR" "$TIME_LOG_BROTLI_DIR" "$TIME_LOG_HDT_DIR" "$METRICS_JSON_DIR"
216+
TIME_LOG_GZIP="$TIME_LOG_GZIP_DIR/${RUN_ID}"
217+
TIME_LOG_BROTLI="$TIME_LOG_BROTLI_DIR/${RUN_ID}"
218+
TIME_LOG_HDT="$TIME_LOG_HDT_DIR/${RUN_ID}"
219+
METRICS_JSON="$METRICS_JSON_DIR/${RUN_ID}"
215220

216221
HDT_SOURCE="not_used"
217222
GZIP_ON_HDT_SIZE=0

src/run_conversion.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,11 @@ SAFE_OUT_NAME=$(printf "%s" "$OUT_NAME" | tr -cs 'A-Za-z0-9._-' '_')
2929
if [[ -z "$SAFE_OUT_NAME" ]]; then
3030
SAFE_OUT_NAME="rdf"
3131
fi
32-
TIME_LOG="$LOGDIR/conversion-time-${SAFE_OUT_NAME}-${RUN_ID}.txt"
33-
METRICS_JSON="$LOGDIR/conversion-metrics-${SAFE_OUT_NAME}-${RUN_ID}.json"
32+
TIME_LOG_DIR="$LOGDIR/conversion_time/${SAFE_OUT_NAME}"
33+
METRICS_JSON_DIR="$LOGDIR/conversion_metrics/${SAFE_OUT_NAME}"
34+
mkdir -p "$TIME_LOG_DIR" "$METRICS_JSON_DIR"
35+
TIME_LOG="$TIME_LOG_DIR/${RUN_ID}"
36+
METRICS_JSON="$METRICS_JSON_DIR/${RUN_ID}"
3437
METRICS_CSV="$LOGDIR/metrics.csv"
3538
METRICS_HEADER="run_id,timestamp,output_name,output_dir,exit_code_java,wall_seconds_java,user_seconds_java,sys_seconds_java,max_rss_kb_java,input_mapping_size_bytes,input_vcf_size_bytes,output_dir_size_bytes,output_triples,jar,mapping_file,output_path,combined_nq_size_bytes,gzip_size_bytes,brotli_size_bytes,hdt_size_bytes,exit_code_gzip,exit_code_brotli,exit_code_hdt,wall_seconds_gzip,user_seconds_gzip,sys_seconds_gzip,max_rss_kb_gzip,wall_seconds_brotli,user_seconds_brotli,sys_seconds_brotli,max_rss_kb_brotli,wall_seconds_hdt,user_seconds_hdt,sys_seconds_hdt,max_rss_kb_hdt,compression_methods,hdt_source,gzip_on_hdt_size_bytes,brotli_on_hdt_size_bytes,exit_code_gzip_on_hdt,exit_code_brotli_on_hdt,wall_seconds_gzip_on_hdt,user_seconds_gzip_on_hdt,sys_seconds_gzip_on_hdt,max_rss_kb_gzip_on_hdt,wall_seconds_brotli_on_hdt,user_seconds_brotli_on_hdt,sys_seconds_brotli_on_hdt,max_rss_kb_brotli_on_hdt"
3639

@@ -238,7 +241,7 @@ if [[ ! -f "$METRICS_CSV" ]]; then
238241
else
239242
EXISTING_HEADER=$(head -n 1 "$METRICS_CSV")
240243
if [[ "$EXISTING_HEADER" != "$METRICS_HEADER" ]]; then
241-
BACKUP="$METRICS_CSV.bak-$RUN_ID"
244+
BACKUP="$LOGDIR/metrics_csv_bak_${RUN_ID}"
242245
cp "$METRICS_CSV" "$BACKUP"
243246
echo "WARNING: metrics header mismatch; backed up to $BACKUP and creating new metrics file." >&2
244247
echo "$METRICS_HEADER" > "$METRICS_CSV"

test/test_compression_unit.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,10 @@ def test_compression_updates_existing_metrics_row_with_mocked_tools(self):
154154
self.assertTrue((output / "rdf.nq.gz").exists())
155155
self.assertTrue((output / "rdf.nq.br").exists())
156156
self.assertTrue((output / "rdf.hdt").exists())
157-
self.assertTrue((logdir / "compression-time-gzip-rdf-run-compress-1.txt").exists())
158-
self.assertTrue((logdir / "compression-time-brotli-rdf-run-compress-1.txt").exists())
159-
self.assertTrue((logdir / "compression-time-hdt-rdf-run-compress-1.txt").exists())
160-
self.assertTrue((logdir / "compression-metrics-rdf-run-compress-1.json").exists())
157+
self.assertTrue((logdir / "compression_time" / "gzip" / "rdf" / "run-compress-1").exists())
158+
self.assertTrue((logdir / "compression_time" / "brotli" / "rdf" / "run-compress-1").exists())
159+
self.assertTrue((logdir / "compression_time" / "hdt" / "rdf" / "run-compress-1").exists())
160+
self.assertTrue((logdir / "compression_metrics" / "rdf" / "run-compress-1").exists())
161161

162162
row = read_metrics_row(metrics_csv, run_id, "rdf")
163163
self.assertEqual(row["run_id"], run_id)
@@ -327,7 +327,7 @@ def test_compression_backs_up_metrics_file_on_header_mismatch(self):
327327
)
328328
result = subprocess.run(["bash", str(SCRIPT), "-m", "gzip"], env=env, capture_output=True, text=True)
329329
self.assertEqual(result.returncode, 0, msg=result.stderr)
330-
self.assertTrue((logdir / "metrics.csv.bak-run-hdr").exists())
330+
self.assertTrue((logdir / "metrics_csv_bak_run-hdr").exists())
331331

332332
def test_compression_reports_failure_when_gzip_fails(self):
333333
"""gzip failure path returns non-zero and records gzip exit code."""

test/test_run_conversion_unit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ def test_run_conversion_writes_nt_and_metrics_without_real_java(self):
6767
merged_nt = out_dir / "rdf" / "rdf.nt"
6868
self.assertTrue(merged_nt.exists())
6969
self.assertIn("<s> <p> <o> .", merged_nt.read_text())
70-
self.assertTrue((metrics_dir / "conversion-time-rdf-run123.txt").exists())
71-
self.assertTrue((metrics_dir / "conversion-metrics-rdf-run123.json").exists())
70+
self.assertTrue((metrics_dir / "conversion_time" / "rdf" / "run123").exists())
71+
self.assertTrue((metrics_dir / "conversion_metrics" / "rdf" / "run123").exists())
7272

7373
metrics_csv = metrics_dir / "metrics.csv"
7474
self.assertTrue(metrics_csv.exists())
@@ -183,7 +183,7 @@ def test_run_conversion_backs_up_metrics_file_on_header_mismatch(self):
183183

184184
result = subprocess.run(["bash", str(SCRIPT)], env=env, capture_output=True, text=True)
185185
self.assertEqual(result.returncode, 0, msg=result.stderr)
186-
self.assertTrue((metrics_dir / "metrics.csv.bak-run-hdr").exists())
186+
self.assertTrue((metrics_dir / "metrics_csv_bak_run-hdr").exists())
187187

188188
def test_run_conversion_handles_comment_only_output_without_crashing(self):
189189
"""Comment-only output does not crash triple counting and records zero triples."""

test/test_vcf_rdfizer_unit.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,9 @@ def fake_run(cmd, cwd=None, env=None):
471471
payload = {"artifacts": {"output_triples": {"TOTAL": 17}}}
472472
run_metrics_dir = metrics_dir / run_id
473473
run_metrics_dir.mkdir(parents=True, exist_ok=True)
474-
(run_metrics_dir / f"conversion-metrics-{out_name}-{run_id}.json").write_text(
474+
conversion_metrics_dir = run_metrics_dir / "conversion_metrics" / out_name
475+
conversion_metrics_dir.mkdir(parents=True, exist_ok=True)
476+
(conversion_metrics_dir / run_id).write_text(
475477
json.dumps(payload),
476478
encoding="utf-8",
477479
)
@@ -840,6 +842,58 @@ def fake_run(cmd, cwd=None, env=None):
840842
self.assertIn("/data/in/part-00001.nt", gzip_cmds[1][-1])
841843
self.assertEqual(out_buf.getvalue().count("* Output directory:"), 1)
842844

845+
def test_main_full_mode_batch_metrics_upsert_is_sample_scoped(self):
846+
"""Batch layout writes compression CSV metrics once per sample, not once per RDF part."""
847+
with tempfile.TemporaryDirectory() as td:
848+
tmp_path = Path(td)
849+
input_dir, rules_path = prepare_inputs(tmp_path)
850+
out_dir = tmp_path / "out"
851+
seen_output_names = []
852+
853+
def fake_run(cmd, cwd=None, env=None):
854+
if "/opt/vcf-rdfizer/run_conversion.sh" in cmd:
855+
sample_dir = out_dir / "sample"
856+
sample_dir.mkdir(parents=True, exist_ok=True)
857+
(sample_dir / "part-00000.nt").write_text("<s1> <p> <o> .\n")
858+
(sample_dir / "part-00001.nt").write_text("<s2> <p> <o> .\n")
859+
return 0
860+
861+
def fake_update_metrics_csv_with_compression(**kwargs):
862+
seen_output_names.append(kwargs["output_name"])
863+
864+
old_cwd = os.getcwd()
865+
os.chdir(tmp_path)
866+
try:
867+
with mock.patch.object(vcf_rdfizer, "run", side_effect=fake_run), mock.patch.object(
868+
vcf_rdfizer, "check_docker", return_value=True
869+
), mock.patch.object(
870+
vcf_rdfizer, "docker_image_exists", return_value=True
871+
), mock.patch.object(
872+
vcf_rdfizer, "discover_tsv_triplets", return_value=mocked_triplets()
873+
), mock.patch.object(
874+
vcf_rdfizer, "update_metrics_csv_with_compression", side_effect=fake_update_metrics_csv_with_compression
875+
):
876+
rc = invoke_main(
877+
[
878+
"--input",
879+
str(input_dir),
880+
"--rules",
881+
str(rules_path),
882+
"--rdf-layout",
883+
"batch",
884+
"--compression",
885+
"gzip",
886+
"--out",
887+
str(out_dir),
888+
"--keep-tsv",
889+
]
890+
)
891+
finally:
892+
os.chdir(old_cwd)
893+
894+
self.assertEqual(rc, 0)
895+
self.assertEqual(seen_output_names, ["sample"])
896+
843897
def test_main_full_mode_aggregate_layout_sets_merge_flag(self):
844898
"""Aggregate layout passes AGGREGATE_RDF=1 to conversion step."""
845899
with tempfile.TemporaryDirectory() as td:
@@ -1151,10 +1205,10 @@ def fake_run(cmd, cwd=None, env=None):
11511205
self.assertIn("sample", csv_text)
11521206
self.assertIn("hdt", csv_text)
11531207

1154-
json_files = list(run_metrics_dir.glob("compression-metrics-sample-*.json"))
1155-
time_files = list(run_metrics_dir.glob("compression-time-hdt-sample-*.txt"))
1156-
self.assertTrue(json_files)
1157-
self.assertTrue(time_files)
1208+
json_file = run_metrics_dir / "compression_metrics" / "sample" / run_metrics_dir.name
1209+
time_file = run_metrics_dir / "compression_time" / "hdt" / "sample" / run_metrics_dir.name
1210+
self.assertTrue(json_file.exists())
1211+
self.assertTrue(time_file.exists())
11581212

11591213
def test_main_full_mode_deletes_nt_with_docker_fallback_on_permission_error(self):
11601214
"""Full mode falls back to Docker-based removal when .nt unlink raises PermissionError."""

0 commit comments

Comments (0)