Skip to content

Commit 6e118e7

Browse files
committed
fixed metrics and output metric file names
1 parent 27e3bee commit 6e118e7

File tree

7 files changed

+193
-45
lines changed

7 files changed

+193
-45
lines changed

README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,19 +152,20 @@ Outputs:
152152
- each wrapper invocation creates a run-specific subdirectory: `run_metrics/<RUN_ID>/`
153153
- example: `run_metrics/20260225T120434/`
154154
- `run_metrics/<RUN_ID>/metrics.csv` includes both conversion and compression metrics for that run
155+
- in `--rdf-layout batch`, compression metrics are aggregated across all part files so the CSV stays at one row per sample
155156
- compound-compression fields are explicit and separate from raw-RDF compression:
156157
- `gzip_on_hdt_*` (gzip applied to `.hdt`)
157158
- `brotli_on_hdt_*` (brotli applied to `.hdt`)
158159
- `hdt_source` (`generated` vs `existing` when reused)
159160
- conversion step artifacts:
160-
- `run_metrics/<RUN_ID>/conversion-time-<output_name>-<run_id>.txt`
161-
- `run_metrics/<RUN_ID>/conversion-metrics-<output_name>-<run_id>.json`
161+
- `run_metrics/<RUN_ID>/conversion_time/<output_name>/<RUN_ID>`
162+
- `run_metrics/<RUN_ID>/conversion_metrics/<output_name>/<RUN_ID>`
162163
- compression step artifacts:
163-
- `run_metrics/<RUN_ID>/compression-time-<method>-<output_name>-<run_id>.txt`
164-
- `run_metrics/<RUN_ID>/compression-metrics-<output_name>-<run_id>.json`
164+
- `run_metrics/<RUN_ID>/compression_time/<method>/<output_name>/<RUN_ID>`
165+
- `run_metrics/<RUN_ID>/compression_metrics/<output_name>/<RUN_ID>`
165166
- wrapper runtime artifacts:
166167
- `run_metrics/<RUN_ID>/wrapper_execution_times.csv` (one row for that run with mode, elapsed time, status, and full-mode triple totals when available)
167-
- `run_metrics/<RUN_ID>/.wrapper_logs/wrapper-<run_id>.log` stores detailed Docker/stdout/stderr command output
168+
- `run_metrics/<RUN_ID>/wrapper_logs/<RUN_ID>` stores detailed Docker/stdout/stderr command output
168169

169170
Small VCF fixtures for RDF size/inflation test runs:
170171
- `test/test_vcf_files/test-100.vcf` (100 total lines)

src/compression.sh

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ if [[ ! -f "$METRICS_CSV" ]]; then
184184
else
185185
EXISTING_HEADER=$(head -n 1 "$METRICS_CSV")
186186
if [[ "$EXISTING_HEADER" != "$METRICS_HEADER" ]]; then
187-
BACKUP="$METRICS_CSV.bak-$RUN_ID"
187+
BACKUP="$LOGDIR/metrics_csv_bak_${RUN_ID}"
188188
cp "$METRICS_CSV" "$BACKUP"
189189
echo "WARNING: metrics header mismatch; backed up to $BACKUP and creating new metrics file." >&2
190190
echo "$METRICS_HEADER" > "$METRICS_CSV"
@@ -208,10 +208,15 @@ for OUT in "${OUTPUT_DIRS[@]}"; do
208208
SAFE_BASENAME="rdf"
209209
fi
210210

211-
TIME_LOG_GZIP="$LOGDIR/compression-time-gzip-${SAFE_BASENAME}-${RUN_ID}.txt"
212-
TIME_LOG_BROTLI="$LOGDIR/compression-time-brotli-${SAFE_BASENAME}-${RUN_ID}.txt"
213-
TIME_LOG_HDT="$LOGDIR/compression-time-hdt-${SAFE_BASENAME}-${RUN_ID}.txt"
214-
METRICS_JSON="$LOGDIR/compression-metrics-${SAFE_BASENAME}-${RUN_ID}.json"
211+
TIME_LOG_GZIP_DIR="$LOGDIR/compression_time/gzip/${SAFE_BASENAME}"
212+
TIME_LOG_BROTLI_DIR="$LOGDIR/compression_time/brotli/${SAFE_BASENAME}"
213+
TIME_LOG_HDT_DIR="$LOGDIR/compression_time/hdt/${SAFE_BASENAME}"
214+
METRICS_JSON_DIR="$LOGDIR/compression_metrics/${SAFE_BASENAME}"
215+
mkdir -p "$TIME_LOG_GZIP_DIR" "$TIME_LOG_BROTLI_DIR" "$TIME_LOG_HDT_DIR" "$METRICS_JSON_DIR"
216+
TIME_LOG_GZIP="$TIME_LOG_GZIP_DIR/${RUN_ID}"
217+
TIME_LOG_BROTLI="$TIME_LOG_BROTLI_DIR/${RUN_ID}"
218+
TIME_LOG_HDT="$TIME_LOG_HDT_DIR/${RUN_ID}"
219+
METRICS_JSON="$METRICS_JSON_DIR/${RUN_ID}"
215220

216221
HDT_SOURCE="not_used"
217222
GZIP_ON_HDT_SIZE=0

src/run_conversion.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,11 @@ SAFE_OUT_NAME=$(printf "%s" "$OUT_NAME" | tr -cs 'A-Za-z0-9._-' '_')
2929
if [[ -z "$SAFE_OUT_NAME" ]]; then
3030
SAFE_OUT_NAME="rdf"
3131
fi
32-
TIME_LOG="$LOGDIR/conversion-time-${SAFE_OUT_NAME}-${RUN_ID}.txt"
33-
METRICS_JSON="$LOGDIR/conversion-metrics-${SAFE_OUT_NAME}-${RUN_ID}.json"
32+
TIME_LOG_DIR="$LOGDIR/conversion_time/${SAFE_OUT_NAME}"
33+
METRICS_JSON_DIR="$LOGDIR/conversion_metrics/${SAFE_OUT_NAME}"
34+
mkdir -p "$TIME_LOG_DIR" "$METRICS_JSON_DIR"
35+
TIME_LOG="$TIME_LOG_DIR/${RUN_ID}"
36+
METRICS_JSON="$METRICS_JSON_DIR/${RUN_ID}"
3437
METRICS_CSV="$LOGDIR/metrics.csv"
3538
METRICS_HEADER="run_id,timestamp,output_name,output_dir,exit_code_java,wall_seconds_java,user_seconds_java,sys_seconds_java,max_rss_kb_java,input_mapping_size_bytes,input_vcf_size_bytes,output_dir_size_bytes,output_triples,jar,mapping_file,output_path,combined_nq_size_bytes,gzip_size_bytes,brotli_size_bytes,hdt_size_bytes,exit_code_gzip,exit_code_brotli,exit_code_hdt,wall_seconds_gzip,user_seconds_gzip,sys_seconds_gzip,max_rss_kb_gzip,wall_seconds_brotli,user_seconds_brotli,sys_seconds_brotli,max_rss_kb_brotli,wall_seconds_hdt,user_seconds_hdt,sys_seconds_hdt,max_rss_kb_hdt,compression_methods,hdt_source,gzip_on_hdt_size_bytes,brotli_on_hdt_size_bytes,exit_code_gzip_on_hdt,exit_code_brotli_on_hdt,wall_seconds_gzip_on_hdt,user_seconds_gzip_on_hdt,sys_seconds_gzip_on_hdt,max_rss_kb_gzip_on_hdt,wall_seconds_brotli_on_hdt,user_seconds_brotli_on_hdt,sys_seconds_brotli_on_hdt,max_rss_kb_brotli_on_hdt"
3639

@@ -238,7 +241,7 @@ if [[ ! -f "$METRICS_CSV" ]]; then
238241
else
239242
EXISTING_HEADER=$(head -n 1 "$METRICS_CSV")
240243
if [[ "$EXISTING_HEADER" != "$METRICS_HEADER" ]]; then
241-
BACKUP="$METRICS_CSV.bak-$RUN_ID"
244+
BACKUP="$LOGDIR/metrics_csv_bak_${RUN_ID}"
242245
cp "$METRICS_CSV" "$BACKUP"
243246
echo "WARNING: metrics header mismatch; backed up to $BACKUP and creating new metrics file." >&2
244247
echo "$METRICS_HEADER" > "$METRICS_CSV"

test/test_compression_unit.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,10 @@ def test_compression_updates_existing_metrics_row_with_mocked_tools(self):
154154
self.assertTrue((output / "rdf.nq.gz").exists())
155155
self.assertTrue((output / "rdf.nq.br").exists())
156156
self.assertTrue((output / "rdf.hdt").exists())
157-
self.assertTrue((logdir / "compression-time-gzip-rdf-run-compress-1.txt").exists())
158-
self.assertTrue((logdir / "compression-time-brotli-rdf-run-compress-1.txt").exists())
159-
self.assertTrue((logdir / "compression-time-hdt-rdf-run-compress-1.txt").exists())
160-
self.assertTrue((logdir / "compression-metrics-rdf-run-compress-1.json").exists())
157+
self.assertTrue((logdir / "compression_time" / "gzip" / "rdf" / "run-compress-1").exists())
158+
self.assertTrue((logdir / "compression_time" / "brotli" / "rdf" / "run-compress-1").exists())
159+
self.assertTrue((logdir / "compression_time" / "hdt" / "rdf" / "run-compress-1").exists())
160+
self.assertTrue((logdir / "compression_metrics" / "rdf" / "run-compress-1").exists())
161161

162162
row = read_metrics_row(metrics_csv, run_id, "rdf")
163163
self.assertEqual(row["run_id"], run_id)
@@ -327,7 +327,7 @@ def test_compression_backs_up_metrics_file_on_header_mismatch(self):
327327
)
328328
result = subprocess.run(["bash", str(SCRIPT), "-m", "gzip"], env=env, capture_output=True, text=True)
329329
self.assertEqual(result.returncode, 0, msg=result.stderr)
330-
self.assertTrue((logdir / "metrics.csv.bak-run-hdr").exists())
330+
self.assertTrue((logdir / "metrics_csv_bak_run-hdr").exists())
331331

332332
def test_compression_reports_failure_when_gzip_fails(self):
333333
"""gzip failure path returns non-zero and records gzip exit code."""

test/test_run_conversion_unit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ def test_run_conversion_writes_nt_and_metrics_without_real_java(self):
6767
merged_nt = out_dir / "rdf" / "rdf.nt"
6868
self.assertTrue(merged_nt.exists())
6969
self.assertIn("<s> <p> <o> .", merged_nt.read_text())
70-
self.assertTrue((metrics_dir / "conversion-time-rdf-run123.txt").exists())
71-
self.assertTrue((metrics_dir / "conversion-metrics-rdf-run123.json").exists())
70+
self.assertTrue((metrics_dir / "conversion_time" / "rdf" / "run123").exists())
71+
self.assertTrue((metrics_dir / "conversion_metrics" / "rdf" / "run123").exists())
7272

7373
metrics_csv = metrics_dir / "metrics.csv"
7474
self.assertTrue(metrics_csv.exists())
@@ -183,7 +183,7 @@ def test_run_conversion_backs_up_metrics_file_on_header_mismatch(self):
183183

184184
result = subprocess.run(["bash", str(SCRIPT)], env=env, capture_output=True, text=True)
185185
self.assertEqual(result.returncode, 0, msg=result.stderr)
186-
self.assertTrue((metrics_dir / "metrics.csv.bak-run-hdr").exists())
186+
self.assertTrue((metrics_dir / "metrics_csv_bak_run-hdr").exists())
187187

188188
def test_run_conversion_handles_comment_only_output_without_crashing(self):
189189
"""Comment-only output does not crash triple counting and records zero triples."""

test/test_vcf_rdfizer_unit.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,9 @@ def fake_run(cmd, cwd=None, env=None):
471471
payload = {"artifacts": {"output_triples": {"TOTAL": 17}}}
472472
run_metrics_dir = metrics_dir / run_id
473473
run_metrics_dir.mkdir(parents=True, exist_ok=True)
474-
(run_metrics_dir / f"conversion-metrics-{out_name}-{run_id}.json").write_text(
474+
conversion_metrics_dir = run_metrics_dir / "conversion_metrics" / out_name
475+
conversion_metrics_dir.mkdir(parents=True, exist_ok=True)
476+
(conversion_metrics_dir / run_id).write_text(
475477
json.dumps(payload),
476478
encoding="utf-8",
477479
)
@@ -840,6 +842,58 @@ def fake_run(cmd, cwd=None, env=None):
840842
self.assertIn("/data/in/part-00001.nt", gzip_cmds[1][-1])
841843
self.assertEqual(out_buf.getvalue().count("* Output directory:"), 1)
842844

845+
def test_main_full_mode_batch_metrics_upsert_is_sample_scoped(self):
846+
"""Batch layout writes compression CSV metrics once per sample, not once per RDF part."""
847+
with tempfile.TemporaryDirectory() as td:
848+
tmp_path = Path(td)
849+
input_dir, rules_path = prepare_inputs(tmp_path)
850+
out_dir = tmp_path / "out"
851+
seen_output_names = []
852+
853+
def fake_run(cmd, cwd=None, env=None):
854+
if "/opt/vcf-rdfizer/run_conversion.sh" in cmd:
855+
sample_dir = out_dir / "sample"
856+
sample_dir.mkdir(parents=True, exist_ok=True)
857+
(sample_dir / "part-00000.nt").write_text("<s1> <p> <o> .\n")
858+
(sample_dir / "part-00001.nt").write_text("<s2> <p> <o> .\n")
859+
return 0
860+
861+
def fake_update_metrics_csv_with_compression(**kwargs):
862+
seen_output_names.append(kwargs["output_name"])
863+
864+
old_cwd = os.getcwd()
865+
os.chdir(tmp_path)
866+
try:
867+
with mock.patch.object(vcf_rdfizer, "run", side_effect=fake_run), mock.patch.object(
868+
vcf_rdfizer, "check_docker", return_value=True
869+
), mock.patch.object(
870+
vcf_rdfizer, "docker_image_exists", return_value=True
871+
), mock.patch.object(
872+
vcf_rdfizer, "discover_tsv_triplets", return_value=mocked_triplets()
873+
), mock.patch.object(
874+
vcf_rdfizer, "update_metrics_csv_with_compression", side_effect=fake_update_metrics_csv_with_compression
875+
):
876+
rc = invoke_main(
877+
[
878+
"--input",
879+
str(input_dir),
880+
"--rules",
881+
str(rules_path),
882+
"--rdf-layout",
883+
"batch",
884+
"--compression",
885+
"gzip",
886+
"--out",
887+
str(out_dir),
888+
"--keep-tsv",
889+
]
890+
)
891+
finally:
892+
os.chdir(old_cwd)
893+
894+
self.assertEqual(rc, 0)
895+
self.assertEqual(seen_output_names, ["sample"])
896+
843897
def test_main_full_mode_aggregate_layout_sets_merge_flag(self):
844898
"""Aggregate layout passes AGGREGATE_RDF=1 to conversion step."""
845899
with tempfile.TemporaryDirectory() as td:
@@ -1151,10 +1205,10 @@ def fake_run(cmd, cwd=None, env=None):
11511205
self.assertIn("sample", csv_text)
11521206
self.assertIn("hdt", csv_text)
11531207

1154-
json_files = list(run_metrics_dir.glob("compression-metrics-sample-*.json"))
1155-
time_files = list(run_metrics_dir.glob("compression-time-hdt-sample-*.txt"))
1156-
self.assertTrue(json_files)
1157-
self.assertTrue(time_files)
1208+
json_file = run_metrics_dir / "compression_metrics" / "sample" / run_metrics_dir.name
1209+
time_file = run_metrics_dir / "compression_time" / "hdt" / "sample" / run_metrics_dir.name
1210+
self.assertTrue(json_file.exists())
1211+
self.assertTrue(time_file.exists())
11581212

11591213
def test_main_full_mode_deletes_nt_with_docker_fallback_on_permission_error(self):
11601214
"""Full mode falls back to Docker-based removal when .nt unlink raises PermissionError."""

0 commit comments

Comments (0)