@@ -540,41 +540,6 @@ def print_nt_hdt_summary(
540540 """Print per-output size summary for RDF and selected compression artifacts."""
541541 print (f"{ indent } * Output directory: { output_root } " )
542542
543- def rdf_label (path : Path ) -> str :
544- if path .suffix == ".nt" :
545- return "N-Triples (.nt)"
546- if path .suffix == ".nq" :
547- return "N-Quads (.nq)"
548- if path .suffix :
549- return f"RDF ({ path .suffix } )"
550- return "RDF"
551-
552- def output_name_for_method (path : Path , method : str ) -> str :
553- stem = path .stem
554- ext = path .suffix .lstrip ("." ) or "nt"
555- if method == "gzip" :
556- return f"{ stem } .{ ext } .gz"
557- if method == "brotli" :
558- return f"{ stem } .{ ext } .br"
559- if method == "hdt" :
560- return f"{ stem } .hdt"
561- if method == "hdt_gzip" :
562- return f"{ stem } .hdt.gz"
563- if method == "hdt_brotli" :
564- return f"{ stem } .hdt.br"
565- return f"{ stem } .{ method } "
566-
567- def label_for_method (path : Path , method : str ) -> str :
568- ext = path .suffix .lstrip ("." ) or "nt"
569- labels = {
570- "gzip" : f"gzip (.{ ext } .gz)" ,
571- "brotli" : f"brotli (.{ ext } .br)" ,
572- "hdt" : "HDT (.hdt)" ,
573- "hdt_gzip" : "gzip-on-HDT (.hdt.gz)" ,
574- "hdt_brotli" : "brotli-on-HDT (.hdt.br)" ,
575- }
576- return labels .get (method , method )
577-
578543 nt_size = nt_size_override if nt_size_override is not None else file_size_bytes (nt_path )
579544
580545 if nt_size is None :
@@ -583,7 +548,7 @@ def label_for_method(path: Path, method: str) -> str:
583548 nt_text = f"{ format_bytes (nt_size )} ({ nt_path } )"
584549 if nt_note :
585550 nt_text = f"{ nt_text } ({ nt_note } )"
586- print (f"{ indent } - { rdf_label (nt_path )} : { nt_text } " )
551+ print (f"{ indent } - { rdf_label_for_path (nt_path )} : { nt_text } " )
587552
588553 # Backward-compatible fallback summary when no explicit compression method
589554 # set is provided to this printer.
@@ -601,7 +566,7 @@ def label_for_method(path: Path, method: str) -> str:
601566
602567 results = method_results or {}
603568 for method in selected_methods :
604- artifact_name = output_name_for_method (nt_path , method )
569+ artifact_name = compression_artifact_name_for_method (nt_path , method )
605570 artifact_path = output_root / artifact_name
606571 result = results .get (method , {})
607572 size = result .get ("output_size_bytes" )
@@ -619,7 +584,48 @@ def label_for_method(path: Path, method: str) -> str:
619584 if source == "existing" :
620585 artifact_text = f"{ artifact_text } (reused existing HDT)"
621586
622- print (f"{ indent } - { label_for_method (nt_path , method )} : { artifact_text } " )
587+ print (f"{ indent } - { compression_method_label_for_path (nt_path , method )} : { artifact_text } " )
588+
589+
590+ def rdf_label_for_path (path : Path ) -> str :
591+ """Return human-readable RDF format label for a path."""
592+ if path .suffix == ".nt" :
593+ return "N-Triples (.nt)"
594+ if path .suffix == ".nq" :
595+ return "N-Quads (.nq)"
596+ if path .suffix :
597+ return f"RDF ({ path .suffix } )"
598+ return "RDF"
599+
600+
601+ def compression_artifact_name_for_method (path : Path , method : str ) -> str :
602+ """Compute expected compressed artifact filename for a method."""
603+ stem = path .stem
604+ ext = path .suffix .lstrip ("." ) or "nt"
605+ if method == "gzip" :
606+ return f"{ stem } .{ ext } .gz"
607+ if method == "brotli" :
608+ return f"{ stem } .{ ext } .br"
609+ if method == "hdt" :
610+ return f"{ stem } .hdt"
611+ if method == "hdt_gzip" :
612+ return f"{ stem } .hdt.gz"
613+ if method == "hdt_brotli" :
614+ return f"{ stem } .hdt.br"
615+ return f"{ stem } .{ method } "
616+
617+
618+ def compression_method_label_for_path (path : Path , method : str ) -> str :
619+ """Return human-readable compression method label for a path."""
620+ ext = path .suffix .lstrip ("." ) or "nt"
621+ labels = {
622+ "gzip" : f"gzip (.{ ext } .gz)" ,
623+ "brotli" : f"brotli (.{ ext } .br)" ,
624+ "hdt" : "HDT (.hdt)" ,
625+ "hdt_gzip" : "gzip-on-HDT (.hdt.gz)" ,
626+ "hdt_brotli" : "brotli-on-HDT (.hdt.br)" ,
627+ }
628+ return labels .get (method , method )
623629
624630
625631def remove_file_with_docker_fallback (
@@ -1524,11 +1530,13 @@ def run_full_mode(
15241530 method_results_by_file [raw_rdf_path .name ] = method_results
15251531 print (" * Compression ✅" )
15261532
1533+ raw_size_before_cleanup_by_file : dict [str , int ] = {}
15271534 try :
15281535 # Persist machine-readable metrics after compression succeeds.
15291536 for raw_rdf_path in raw_rdf_files :
15301537 method_results = method_results_by_file .get (raw_rdf_path .name , {})
15311538 source_size_before_cleanup = int (file_size_bytes (raw_rdf_path ) or 0 )
1539+ raw_size_before_cleanup_by_file [raw_rdf_path .name ] = source_size_before_cleanup
15321540 write_compression_metrics_artifacts (
15331541 metrics_dir = metrics_dir ,
15341542 run_id = run_id ,
@@ -1588,28 +1596,71 @@ def run_full_mode(
15881596 ):
15891597 return 1
15901598
1591- for raw_rdf_path in raw_rdf_files :
1592- hdt_path = (out_dir / output_name ) / f"{ raw_rdf_path .stem } .hdt"
1593- rdf_size = file_size_bytes (raw_rdf_path )
1594- nt_note = None
1595- method_results = method_results_by_file .get (raw_rdf_path .name , {})
1596- if raw_rdf_path .exists ():
1597- nt_note = "retained via --keep-rdf" if keep_rdf else "retained"
1598- elif not keep_rdf and selected_methods :
1599- nt_note = "removed, set --keep-rdf to retain"
1600- elif not keep_rdf and not selected_methods :
1601- nt_note = "kept (compression methods set to none)"
1602- print_nt_hdt_summary (
1603- output_root = out_dir / output_name ,
1604- nt_path = raw_rdf_path ,
1605- hdt_path = hdt_path ,
1606- indent = " " ,
1607- nt_note = nt_note ,
1608- nt_size_override = rdf_size ,
1609- selected_methods = selected_methods ,
1610- method_results = method_results ,
1599+ if rdf_layout == "batch" and raw_rdf_files :
1600+ output_root = out_dir / output_name
1601+ part_count = len (raw_rdf_files )
1602+ raw_total_size = sum (raw_size_before_cleanup_by_file .values ())
1603+
1604+ if keep_rdf :
1605+ raw_note = "retained via --keep-rdf"
1606+ elif selected_methods :
1607+ raw_note = "removed, set --keep-rdf to retain"
1608+ else :
1609+ raw_note = "kept (compression methods set to none)"
1610+
1611+ first_path = raw_rdf_files [0 ]
1612+ print (f" * Output directory: { output_root } " )
1613+ print (f" - RDF part files: { part_count } " )
1614+ raw_text = f"{ format_bytes (raw_total_size )} across { part_count } files"
1615+ print (
1616+ f" - { rdf_label_for_path (first_path )} total: { raw_text } "
1617+ f"({ raw_note } )"
16111618 )
16121619
1620+ if selected_methods :
1621+ for method in selected_methods :
1622+ method_total = 0
1623+ method_count = 0
1624+ for raw_rdf_path in raw_rdf_files :
1625+ result = method_results_by_file .get (raw_rdf_path .name , {}).get (method )
1626+ if not result or int (result .get ("exit_code" , 1 )) != 0 :
1627+ continue
1628+ method_total += int (result .get ("output_size_bytes" ) or 0 )
1629+ method_count += 1
1630+
1631+ label = compression_method_label_for_path (first_path , method )
1632+ if method_count == 0 :
1633+ print (f" - { label } : not generated" )
1634+ else :
1635+ print (
1636+ f" - { label } : { format_bytes (method_total )} "
1637+ f"across { method_count } files"
1638+ )
1639+ else :
1640+ print (" - Compression: none selected" )
1641+ else :
1642+ for raw_rdf_path in raw_rdf_files :
1643+ hdt_path = (out_dir / output_name ) / f"{ raw_rdf_path .stem } .hdt"
1644+ rdf_size = file_size_bytes (raw_rdf_path )
1645+ nt_note = None
1646+ method_results = method_results_by_file .get (raw_rdf_path .name , {})
1647+ if raw_rdf_path .exists ():
1648+ nt_note = "retained via --keep-rdf" if keep_rdf else "retained"
1649+ elif not keep_rdf and selected_methods :
1650+ nt_note = "removed, set --keep-rdf to retain"
1651+ elif not keep_rdf and not selected_methods :
1652+ nt_note = "kept (compression methods set to none)"
1653+ print_nt_hdt_summary (
1654+ output_root = out_dir / output_name ,
1655+ nt_path = raw_rdf_path ,
1656+ hdt_path = hdt_path ,
1657+ indent = " " ,
1658+ nt_note = nt_note ,
1659+ nt_size_override = rdf_size ,
1660+ selected_methods = selected_methods ,
1661+ method_results = method_results ,
1662+ )
1663+
16131664 if not keep_tsv :
16141665 # Cleanup only the triplet generated for this input iteration.
16151666 for tsv_path in (triplet ["records" ], triplet ["headers" ], triplet ["metadata" ]):
0 commit comments