Skip to content

Commit ed15136

Browse files
committed
clean batch console logs
1 parent 7bde1c3 commit ed15136

2 files changed

Lines changed: 113 additions & 60 deletions

File tree

test/test_vcf_rdfizer_unit.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -787,12 +787,13 @@ def fake_run(cmd, cwd=None, env=None):
787787
self.assertIn("/opt/vcf-rdfizer/run_conversion.sh", commands[1])
788788

789789
def test_main_full_mode_batch_layout_compresses_each_rml_part(self):
790-
"""Batch layout keeps RML output parts separate and compresses each part individually."""
790+
"""Batch layout compresses each part and prints one consolidated size summary."""
791791
with tempfile.TemporaryDirectory() as td:
792792
tmp_path = Path(td)
793793
input_dir, rules_path = prepare_inputs(tmp_path)
794794
out_dir = tmp_path / "out"
795795
commands = []
796+
out_buf = StringIO()
796797

797798
def fake_run(cmd, cwd=None, env=None):
798799
commands.append(cmd)
@@ -812,7 +813,7 @@ def fake_run(cmd, cwd=None, env=None):
812813
vcf_rdfizer, "docker_image_exists", return_value=True
813814
), mock.patch.object(
814815
vcf_rdfizer, "discover_tsv_triplets", return_value=mocked_triplets()
815-
):
816+
), redirect_stdout(out_buf):
816817
rc = invoke_main(
817818
[
818819
"--input",
@@ -837,6 +838,7 @@ def fake_run(cmd, cwd=None, env=None):
837838
self.assertEqual(len(gzip_cmds), 2)
838839
self.assertIn("/data/in/part-00000.nt", gzip_cmds[0][-1])
839840
self.assertIn("/data/in/part-00001.nt", gzip_cmds[1][-1])
841+
self.assertEqual(out_buf.getvalue().count("* Output directory:"), 1)
840842

841843
def test_main_full_mode_aggregate_layout_sets_merge_flag(self):
842844
"""Aggregate layout passes AGGREGATE_RDF=1 to conversion step."""

vcf_rdfizer.py

Lines changed: 109 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -540,41 +540,6 @@ def print_nt_hdt_summary(
540540
"""Print per-output size summary for RDF and selected compression artifacts."""
541541
print(f"{indent}* Output directory: {output_root}")
542542

543-
def rdf_label(path: Path) -> str:
544-
if path.suffix == ".nt":
545-
return "N-Triples (.nt)"
546-
if path.suffix == ".nq":
547-
return "N-Quads (.nq)"
548-
if path.suffix:
549-
return f"RDF ({path.suffix})"
550-
return "RDF"
551-
552-
def output_name_for_method(path: Path, method: str) -> str:
553-
stem = path.stem
554-
ext = path.suffix.lstrip(".") or "nt"
555-
if method == "gzip":
556-
return f"{stem}.{ext}.gz"
557-
if method == "brotli":
558-
return f"{stem}.{ext}.br"
559-
if method == "hdt":
560-
return f"{stem}.hdt"
561-
if method == "hdt_gzip":
562-
return f"{stem}.hdt.gz"
563-
if method == "hdt_brotli":
564-
return f"{stem}.hdt.br"
565-
return f"{stem}.{method}"
566-
567-
def label_for_method(path: Path, method: str) -> str:
568-
ext = path.suffix.lstrip(".") or "nt"
569-
labels = {
570-
"gzip": f"gzip (.{ext}.gz)",
571-
"brotli": f"brotli (.{ext}.br)",
572-
"hdt": "HDT (.hdt)",
573-
"hdt_gzip": "gzip-on-HDT (.hdt.gz)",
574-
"hdt_brotli": "brotli-on-HDT (.hdt.br)",
575-
}
576-
return labels.get(method, method)
577-
578543
nt_size = nt_size_override if nt_size_override is not None else file_size_bytes(nt_path)
579544

580545
if nt_size is None:
@@ -583,7 +548,7 @@ def label_for_method(path: Path, method: str) -> str:
583548
nt_text = f"{format_bytes(nt_size)} ({nt_path})"
584549
if nt_note:
585550
nt_text = f"{nt_text} ({nt_note})"
586-
print(f"{indent} - {rdf_label(nt_path)}: {nt_text}")
551+
print(f"{indent} - {rdf_label_for_path(nt_path)}: {nt_text}")
587552

588553
# Backward-compatible fallback summary when no explicit compression method
589554
# set is provided to this printer.
@@ -601,7 +566,7 @@ def label_for_method(path: Path, method: str) -> str:
601566

602567
results = method_results or {}
603568
for method in selected_methods:
604-
artifact_name = output_name_for_method(nt_path, method)
569+
artifact_name = compression_artifact_name_for_method(nt_path, method)
605570
artifact_path = output_root / artifact_name
606571
result = results.get(method, {})
607572
size = result.get("output_size_bytes")
@@ -619,7 +584,48 @@ def label_for_method(path: Path, method: str) -> str:
619584
if source == "existing":
620585
artifact_text = f"{artifact_text} (reused existing HDT)"
621586

622-
print(f"{indent} - {label_for_method(nt_path, method)}: {artifact_text}")
587+
print(f"{indent} - {compression_method_label_for_path(nt_path, method)}: {artifact_text}")
588+
589+
590+
def rdf_label_for_path(path: Path) -> str:
591+
"""Return human-readable RDF format label for a path."""
592+
if path.suffix == ".nt":
593+
return "N-Triples (.nt)"
594+
if path.suffix == ".nq":
595+
return "N-Quads (.nq)"
596+
if path.suffix:
597+
return f"RDF ({path.suffix})"
598+
return "RDF"
599+
600+
601+
def compression_artifact_name_for_method(path: Path, method: str) -> str:
602+
"""Compute expected compressed artifact filename for a method."""
603+
stem = path.stem
604+
ext = path.suffix.lstrip(".") or "nt"
605+
if method == "gzip":
606+
return f"{stem}.{ext}.gz"
607+
if method == "brotli":
608+
return f"{stem}.{ext}.br"
609+
if method == "hdt":
610+
return f"{stem}.hdt"
611+
if method == "hdt_gzip":
612+
return f"{stem}.hdt.gz"
613+
if method == "hdt_brotli":
614+
return f"{stem}.hdt.br"
615+
return f"{stem}.{method}"
616+
617+
618+
def compression_method_label_for_path(path: Path, method: str) -> str:
619+
"""Return human-readable compression method label for a path."""
620+
ext = path.suffix.lstrip(".") or "nt"
621+
labels = {
622+
"gzip": f"gzip (.{ext}.gz)",
623+
"brotli": f"brotli (.{ext}.br)",
624+
"hdt": "HDT (.hdt)",
625+
"hdt_gzip": "gzip-on-HDT (.hdt.gz)",
626+
"hdt_brotli": "brotli-on-HDT (.hdt.br)",
627+
}
628+
return labels.get(method, method)
623629

624630

625631
def remove_file_with_docker_fallback(
@@ -1524,11 +1530,13 @@ def run_full_mode(
15241530
method_results_by_file[raw_rdf_path.name] = method_results
15251531
print(" * Compression ✅")
15261532

1533+
raw_size_before_cleanup_by_file: dict[str, int] = {}
15271534
try:
15281535
# Persist machine-readable metrics after compression succeeds.
15291536
for raw_rdf_path in raw_rdf_files:
15301537
method_results = method_results_by_file.get(raw_rdf_path.name, {})
15311538
source_size_before_cleanup = int(file_size_bytes(raw_rdf_path) or 0)
1539+
raw_size_before_cleanup_by_file[raw_rdf_path.name] = source_size_before_cleanup
15321540
write_compression_metrics_artifacts(
15331541
metrics_dir=metrics_dir,
15341542
run_id=run_id,
@@ -1588,28 +1596,71 @@ def run_full_mode(
15881596
):
15891597
return 1
15901598

1591-
for raw_rdf_path in raw_rdf_files:
1592-
hdt_path = (out_dir / output_name) / f"{raw_rdf_path.stem}.hdt"
1593-
rdf_size = file_size_bytes(raw_rdf_path)
1594-
nt_note = None
1595-
method_results = method_results_by_file.get(raw_rdf_path.name, {})
1596-
if raw_rdf_path.exists():
1597-
nt_note = "retained via --keep-rdf" if keep_rdf else "retained"
1598-
elif not keep_rdf and selected_methods:
1599-
nt_note = "removed, set --keep-rdf to retain"
1600-
elif not keep_rdf and not selected_methods:
1601-
nt_note = "kept (compression methods set to none)"
1602-
print_nt_hdt_summary(
1603-
output_root=out_dir / output_name,
1604-
nt_path=raw_rdf_path,
1605-
hdt_path=hdt_path,
1606-
indent=" ",
1607-
nt_note=nt_note,
1608-
nt_size_override=rdf_size,
1609-
selected_methods=selected_methods,
1610-
method_results=method_results,
1599+
if rdf_layout == "batch" and raw_rdf_files:
1600+
output_root = out_dir / output_name
1601+
part_count = len(raw_rdf_files)
1602+
raw_total_size = sum(raw_size_before_cleanup_by_file.values())
1603+
1604+
if keep_rdf:
1605+
raw_note = "retained via --keep-rdf"
1606+
elif selected_methods:
1607+
raw_note = "removed, set --keep-rdf to retain"
1608+
else:
1609+
raw_note = "kept (compression methods set to none)"
1610+
1611+
first_path = raw_rdf_files[0]
1612+
print(f" * Output directory: {output_root}")
1613+
print(f" - RDF part files: {part_count}")
1614+
raw_text = f"{format_bytes(raw_total_size)} across {part_count} files"
1615+
print(
1616+
f" - {rdf_label_for_path(first_path)} total: {raw_text} "
1617+
f"({raw_note})"
16111618
)
16121619

1620+
if selected_methods:
1621+
for method in selected_methods:
1622+
method_total = 0
1623+
method_count = 0
1624+
for raw_rdf_path in raw_rdf_files:
1625+
result = method_results_by_file.get(raw_rdf_path.name, {}).get(method)
1626+
if not result or int(result.get("exit_code", 1)) != 0:
1627+
continue
1628+
method_total += int(result.get("output_size_bytes") or 0)
1629+
method_count += 1
1630+
1631+
label = compression_method_label_for_path(first_path, method)
1632+
if method_count == 0:
1633+
print(f" - {label}: not generated")
1634+
else:
1635+
print(
1636+
f" - {label}: {format_bytes(method_total)} "
1637+
f"across {method_count} files"
1638+
)
1639+
else:
1640+
print(" - Compression: none selected")
1641+
else:
1642+
for raw_rdf_path in raw_rdf_files:
1643+
hdt_path = (out_dir / output_name) / f"{raw_rdf_path.stem}.hdt"
1644+
rdf_size = file_size_bytes(raw_rdf_path)
1645+
nt_note = None
1646+
method_results = method_results_by_file.get(raw_rdf_path.name, {})
1647+
if raw_rdf_path.exists():
1648+
nt_note = "retained via --keep-rdf" if keep_rdf else "retained"
1649+
elif not keep_rdf and selected_methods:
1650+
nt_note = "removed, set --keep-rdf to retain"
1651+
elif not keep_rdf and not selected_methods:
1652+
nt_note = "kept (compression methods set to none)"
1653+
print_nt_hdt_summary(
1654+
output_root=out_dir / output_name,
1655+
nt_path=raw_rdf_path,
1656+
hdt_path=hdt_path,
1657+
indent=" ",
1658+
nt_note=nt_note,
1659+
nt_size_override=rdf_size,
1660+
selected_methods=selected_methods,
1661+
method_results=method_results,
1662+
)
1663+
16131664
if not keep_tsv:
16141665
# Cleanup only the triplet generated for this input iteration.
16151666
for tsv_path in (triplet["records"], triplet["headers"], triplet["metadata"]):

0 commit comments

Comments
 (0)