|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +#include <chrono> |
| 21 | +#include <iostream> |
| 22 | +#include <memory> |
| 23 | +#include <string> |
| 24 | + |
| 25 | +#include <arrow/array.h> |
| 26 | +#include <arrow/c/bridge.h> |
| 27 | +#include <arrow/filesystem/localfs.h> |
| 28 | +#include <arrow/type.h> |
| 29 | + |
| 30 | +#include "iceberg/arrow/arrow_fs_file_io_internal.h" |
| 31 | +#include "iceberg/avro/avro_register.h" |
| 32 | +#include "iceberg/file_reader.h" |
| 33 | +#include "iceberg/schema.h" |
| 34 | + |
| 35 | +void PrintUsage(const char* program_name) { |
| 36 | + std::cerr << "Usage: " << program_name << " [options] <avro_file>\n" |
| 37 | + << "Options:\n" |
| 38 | + << " --skip-datum=<true|false> Use direct decoder (default: true)\n" |
| 39 | + << " --batch-size=<N> Batch size for reading (default: 4096)\n" |
| 40 | + << " --help Show this help message\n" |
| 41 | + << "\nExample:\n" |
| 42 | + << " " << program_name |
| 43 | + << " --skip-datum=false --batch-size=1000 data.avro\n"; |
| 44 | +} |
| 45 | + |
| 46 | +int main(int argc, char* argv[]) { |
| 47 | + iceberg::avro::RegisterAll(); |
| 48 | + |
| 49 | + if (argc < 2) { |
| 50 | + PrintUsage(argv[0]); |
| 51 | + return 1; |
| 52 | + } |
| 53 | + |
| 54 | + std::string avro_file; |
| 55 | + bool skip_datum = true; |
| 56 | + int64_t batch_size = 4096; |
| 57 | + |
| 58 | + // Parse arguments |
| 59 | + for (int i = 1; i < argc; ++i) { |
| 60 | + std::string arg = argv[i]; |
| 61 | + if (arg == "--help") { |
| 62 | + PrintUsage(argv[0]); |
| 63 | + return 0; |
| 64 | + } else if (arg.starts_with("--skip-datum=")) { |
| 65 | + std::string value = arg.substr(13); |
| 66 | + if (value == "true" || value == "1") { |
| 67 | + skip_datum = true; |
| 68 | + } else if (value == "false" || value == "0") { |
| 69 | + skip_datum = false; |
| 70 | + } else { |
| 71 | + std::cerr << "Invalid value for --skip-datum: " << value << "\n"; |
| 72 | + return 1; |
| 73 | + } |
| 74 | + } else if (arg.starts_with("--batch-size=")) { |
| 75 | + batch_size = std::stoll(arg.substr(13)); |
| 76 | + if (batch_size <= 0) { |
| 77 | + std::cerr << "Batch size must be positive\n"; |
| 78 | + return 1; |
| 79 | + } |
| 80 | + } else if (arg[0] == '-') { |
| 81 | + std::cerr << "Unknown option: " << arg << "\n"; |
| 82 | + PrintUsage(argv[0]); |
| 83 | + return 1; |
| 84 | + } else { |
| 85 | + avro_file = arg; |
| 86 | + } |
| 87 | + } |
| 88 | + |
| 89 | + if (avro_file.empty()) { |
| 90 | + std::cerr << "Error: No Avro file specified\n"; |
| 91 | + PrintUsage(argv[0]); |
| 92 | + return 1; |
| 93 | + } |
| 94 | + |
| 95 | + std::cout << "Scanning Avro file: " << avro_file << "\n"; |
| 96 | + std::cout << "Skip datum: " << (skip_datum ? "true" : "false") << "\n"; |
| 97 | + std::cout << "Batch size: " << batch_size << "\n"; |
| 98 | + std::cout << std::string(60, '-') << "\n"; |
| 99 | + |
| 100 | + auto local_fs = std::make_shared<::arrow::fs::LocalFileSystem>(); |
| 101 | + auto file_io = std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs); |
| 102 | + |
| 103 | + // Get file info |
| 104 | + auto file_info_result = local_fs->GetFileInfo(avro_file); |
| 105 | + if (!file_info_result.ok()) { |
| 106 | + std::cerr << "Error: Cannot access file: " << file_info_result.status().message() |
| 107 | + << "\n"; |
| 108 | + return 1; |
| 109 | + } |
| 110 | + auto file_info = file_info_result.ValueOrDie(); |
| 111 | + if (file_info.type() != ::arrow::fs::FileType::File) { |
| 112 | + std::cerr << "Error: Not a file: " << avro_file << "\n"; |
| 113 | + return 1; |
| 114 | + } |
| 115 | + |
| 116 | + std::cout << "File size: " << file_info.size() << " bytes\n"; |
| 117 | + |
| 118 | + // Configure reader properties |
| 119 | + auto reader_properties = iceberg::ReaderProperties::default_properties(); |
| 120 | + reader_properties->Set(iceberg::ReaderProperties::kAvroSkipDatum, skip_datum); |
| 121 | + reader_properties->Set(iceberg::ReaderProperties::kBatchSize, batch_size); |
| 122 | + |
| 123 | + // Open reader (without projection to read all columns) |
| 124 | + auto reader_result = iceberg::ReaderFactoryRegistry::Open( |
| 125 | + iceberg::FileFormatType::kAvro, {.path = avro_file, |
| 126 | + .length = file_info.size(), |
| 127 | + .io = file_io, |
| 128 | + .projection = nullptr, |
| 129 | + .properties = std::move(reader_properties)}); |
| 130 | + |
| 131 | + if (!reader_result.has_value()) { |
| 132 | + std::cerr << "Error opening reader: " << reader_result.error().message << "\n"; |
| 133 | + return 1; |
| 134 | + } |
| 135 | + |
| 136 | + auto reader = std::move(reader_result.value()); |
| 137 | + |
| 138 | + // Get schema |
| 139 | + auto schema_result = reader->Schema(); |
| 140 | + if (!schema_result.has_value()) { |
| 141 | + std::cerr << "Error getting schema: " << schema_result.error().message << "\n"; |
| 142 | + return 1; |
| 143 | + } |
| 144 | + auto arrow_schema = schema_result.value(); |
| 145 | + auto arrow_schema_import = ::arrow::ImportType(&arrow_schema); |
| 146 | + if (!arrow_schema_import.ok()) { |
| 147 | + std::cerr << "Error importing schema: " << arrow_schema_import.status().message() |
| 148 | + << "\n"; |
| 149 | + return 1; |
| 150 | + } |
| 151 | + std::cout << "Schema: " << arrow_schema_import.ValueOrDie()->ToString() << "\n"; |
| 152 | + std::cout << std::string(60, '-') << "\n"; |
| 153 | + |
| 154 | + // Scan file and measure time |
| 155 | + auto start = std::chrono::high_resolution_clock::now(); |
| 156 | + |
| 157 | + int64_t total_rows = 0; |
| 158 | + int64_t batch_count = 0; |
| 159 | + |
| 160 | + while (true) { |
| 161 | + auto batch_result = reader->Next(); |
| 162 | + if (!batch_result.has_value()) { |
| 163 | + std::cerr << "Error reading batch: " << batch_result.error().message << "\n"; |
| 164 | + return 1; |
| 165 | + } |
| 166 | + |
| 167 | + auto batch_opt = batch_result.value(); |
| 168 | + if (!batch_opt.has_value()) { |
| 169 | + // End of file |
| 170 | + break; |
| 171 | + } |
| 172 | + |
| 173 | + auto arrow_array = batch_opt.value(); |
| 174 | + auto arrow_type = arrow_schema_import.ValueOrDie(); |
| 175 | + auto array_import = ::arrow::ImportArray(&arrow_array, arrow_type); |
| 176 | + if (!array_import.ok()) { |
| 177 | + std::cerr << "Error importing array: " << array_import.status().message() << "\n"; |
| 178 | + return 1; |
| 179 | + } |
| 180 | + |
| 181 | + int64_t batch_rows = array_import.ValueOrDie()->length(); |
| 182 | + total_rows += batch_rows; |
| 183 | + batch_count++; |
| 184 | + } |
| 185 | + |
| 186 | + auto end = std::chrono::high_resolution_clock::now(); |
| 187 | + auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start); |
| 188 | + |
| 189 | + // Print results |
| 190 | + std::cout << "\nResults:\n"; |
| 191 | + std::cout << " Total rows: " << total_rows << "\n"; |
| 192 | + std::cout << " Batches: " << batch_count << "\n"; |
| 193 | + std::cout << " Time: " << duration.count() << " ms\n"; |
| 194 | + std::cout << " Throughput: " |
| 195 | + << (duration.count() > 0 ? (total_rows * 1000 / duration.count()) : 0) |
| 196 | + << " rows/sec\n"; |
| 197 | + std::cout << " Throughput: " |
| 198 | + << (duration.count() > 0 |
| 199 | + ? (file_info.size() / 1024.0 / 1024.0) / (duration.count() / 1000.0) |
| 200 | + : 0) |
| 201 | + << " MB/sec\n"; |
| 202 | + |
| 203 | + return 0; |
| 204 | +} |
0 commit comments