2626import com .google .common .base .Preconditions ;
2727import com .google .common .collect .Lists ;
2828import java .io .IOException ;
29+ import java .util .Collection ;
30+ import java .util .HashSet ;
2931import java .util .List ;
32+ import java .util .Set ;
33+ import javax .annotation .Nullable ;
3034import org .apache .commons .text .TextStringBuilder ;
3135import org .apache .parquet .cli .BaseCommand ;
3236import org .apache .parquet .column .statistics .SizeStatistics ;
4044@ Parameters (commandDescription = "Print size statistics for a Parquet file" )
4145public class ShowSizeStatisticsCommand extends BaseCommand {
4246
47+ private static <T > Set <T > filterOrNull (Collection <T > values ) {
48+ if (values == null || values .isEmpty ()) return null ;
49+ return new HashSet <>(values );
50+ }
51+
52+ private static boolean includesOrAll (@ Nullable Set <?> filter , Object value ) {
53+ return filter == null || filter .contains (value );
54+ }
55+
/**
 * Creates the command.
 *
 * @param console the logger passed on to the {@link BaseCommand} constructor
 */
public ShowSizeStatisticsCommand(Logger console) {
  super(console);
}
4659
4760 @ Parameter (description = "<parquet path>" )
4861 List <String > targets ;
4962
63+ @ Parameter (
64+ names = {"-c" , "--column" , "--columns" },
65+ description = "List of columns (dot paths) to include" )
66+ List <String > columns ;
67+
68+ @ Parameter (
69+ names = {"-r" , "--row-group" , "--row-groups" },
70+ description = "List of row-group indexes to include (0-based)" )
71+ List <Integer > rowGroups ;
72+
5073 @ Override
5174 @ SuppressWarnings ("unchecked" )
5275 public int run () throws IOException {
@@ -60,9 +83,13 @@ public int run() throws IOException {
6083
6184 console .info ("\n File path: {}" , source );
6285
63- List <BlockMetaData > rowGroups = footer .getBlocks ();
64- for (int index = 0 , n = rowGroups .size (); index < n ; index ++) {
65- printRowGroupSizeStats (console , index , rowGroups .get (index ), schema );
86+ List <BlockMetaData > blocks = footer .getBlocks ();
87+
88+ final Set <Integer > rowGroupFilter = filterOrNull (this .rowGroups );
89+
90+ for (int i = 0 , n = blocks .size (); i < n ; i ++) {
91+ if (!includesOrAll (rowGroupFilter , i )) continue ;
92+ printRowGroupSizeStats (console , i , blocks .get (i ), schema );
6693 console .info ("" );
6794 }
6895 }
@@ -84,7 +111,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row
84111 console .info (
85112 String .format (formatString , "column" , "unencoded bytes" , "rep level histogram" , "def level histogram" ));
86113
114+ final Set <String > columnFilter = filterOrNull (this .columns );
115+
87116 for (ColumnChunkMetaData column : rowGroup .getColumns ()) {
117+ String dotPath = column .getPath ().toDotString ();
118+ if (!includesOrAll (columnFilter , dotPath )) continue ;
88119 printColumnSizeStats (console , column , schema , maxColumnWidth );
89120 }
90121 }
@@ -111,6 +142,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me
111142
112143 @ Override
113144 public List <String > getExamples () {
114- return Lists .newArrayList ("# Show size statistics for a Parquet file" , "sample.parquet" );
145+ return Lists .newArrayList (
146+ "# Show size statistics for a Parquet file" ,
147+ "sample.parquet" ,
148+ "# Show size statistics for selected columns" ,
149+ "sample.parquet -c name,tags" ,
150+ "# Show size statistics for a specific row-group" ,
151+ "sample.parquet -r 0" );
115152 }
116153}
0 commit comments