diff --git a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java index 8a2ad90ff7ef..c6ed06f5fe06 100644 --- a/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java +++ b/paimon-core/src/main/java/org/apache/paimon/consumer/ConsumerManager.java @@ -45,7 +45,7 @@ public class ConsumerManager implements Serializable { private static final long serialVersionUID = 1L; - private static final String CONSUMER_PREFIX = "consumer-"; + public static final String CONSUMER_PREFIX = "consumer-"; private final FileIO fileIO; private final Path tablePath; diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java index 8667f2271d32..549c0e96a8d8 100644 --- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java +++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaManager.java @@ -109,7 +109,7 @@ @ThreadSafe public class SchemaManager implements Serializable { - private static final String SCHEMA_PREFIX = "schema-"; + public static final String SCHEMA_PREFIX = "schema-"; private final FileIO fileIO; private final Path tableRoot; diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java new file mode 100644 index 000000000000..963780cbfd93 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/utils/FileType.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.consumer.ConsumerManager; +import org.apache.paimon.fs.Path; +import org.apache.paimon.io.DataFilePathFactory; +import org.apache.paimon.schema.SchemaManager; +import org.apache.paimon.service.ServiceManager; + +/** + * Classification of Paimon files. + * + * + */ +public enum FileType { + META, + DATA, + BUCKET_INDEX, + GLOBAL_INDEX, + FILE_INDEX; + + private static final String MANIFEST = "manifest"; + private static final String CHANGELOG_DIR = "changelog"; + private static final String GLOBAL_INDEX_INFIX = "global-index-"; + + /** Returns {@code true} if this file type is any kind of index. */ + public boolean isIndex() { + return this == BUCKET_INDEX || this == GLOBAL_INDEX || this == FILE_INDEX; + } + + /** + * Classify a file based on its full path. + * + *

When the file does not match any known pattern, it defaults to {@link #DATA}. + */ + public static FileType classify(Path filePath) { + String name = filePath.getName(); + + // meta file prefixes: snapshot-, schema-, stat-, tag-, consumer-, service- + if (name.startsWith(SnapshotManager.SNAPSHOT_PREFIX) + || name.startsWith(SchemaManager.SCHEMA_PREFIX) + || name.startsWith(FileStorePathFactory.STATISTICS_PREFIX) + || name.startsWith(TagManager.TAG_PREFIX) + || name.startsWith(ConsumerManager.CONSUMER_PREFIX) + || name.startsWith(ServiceManager.SERVICE_PREFIX)) { + return META; + } + + // file index: {data-file}.index (e.g. data-xxx.orc.index) + // must check before global index since global index also ends with ".index" + if (name.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX)) { + if (name.contains(GLOBAL_INDEX_INFIX)) { + return GLOBAL_INDEX; + } + return FILE_INDEX; + } + + // manifest, manifest-list, index-manifest: name contains "manifest" + if (name.contains(MANIFEST)) { + return META; + } + + // bucket index: name starts with "index-" (e.g. index-{uuid}-{N}) + if (name.startsWith(FileStorePathFactory.INDEX_PREFIX)) { + return BUCKET_INDEX; + } + + // hint files + if ("EARLIEST".equals(name) || "LATEST".equals(name)) { + return META; + } + + // success files + if ("_SUCCESS".equals(name) || name.endsWith("_SUCCESS")) { + return META; + } + + // changelog metadata: parent dir is "changelog" and name starts with "changelog-" + if (name.startsWith(ChangelogManager.CHANGELOG_PREFIX) + && CHANGELOG_DIR.equals(filePath.getParent().getName())) { + return META; + } + + // default: DATA + return DATA; + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java index 703b93e145be..4a0a18e51e01 100644 --- a/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java +++ b/paimon-core/src/main/java/org/apache/paimon/utils/TagManager.java @@ -62,7 +62,7 @@ public class TagManager { private static final Logger LOG = LoggerFactory.getLogger(TagManager.class); - private static final String TAG_PREFIX = "tag-"; + public static final String TAG_PREFIX = "tag-"; private final FileIO fileIO; private final Path tablePath; diff --git a/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java new file mode 100644 index 000000000000..423fe3ba5fed --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.fs.Path; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link FileType}. */ +public class FileTypeTest { + + private static final String TABLE_ROOT = "hdfs://cluster/warehouse/db.db/table"; + + // ===== META files ===== + + @Test + public void testMetaFiles() { + // snapshot + assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-1"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-100"))) + .isEqualTo(FileType.META); + // schema + assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-0"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-5"))) + .isEqualTo(FileType.META); + // manifest + assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-a1b2c3d4-0"))) + .isEqualTo(FileType.META); + // manifest-list + assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-list-a1b2c3d4-0"))) + .isEqualTo(FileType.META); + // index-manifest + assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/index-manifest-a1b2c3d4-0"))) + .isEqualTo(FileType.META); + // statistics + assertThat(FileType.classify(new Path(TABLE_ROOT + "/statistics/stat-a1b2c3d4-0"))) + .isEqualTo(FileType.META); + // tag + assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-2024-01-01"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-myTag"))) + .isEqualTo(FileType.META); + // changelog metadata + assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-1"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-100"))) + .isEqualTo(FileType.META); + // hint files + assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/EARLIEST"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/LATEST"))) + .isEqualTo(FileType.META); + // success files + assertThat(FileType.classify(new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/_SUCCESS"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-success-file/myTag_SUCCESS"))) + .isEqualTo(FileType.META); + // consumer + assertThat(FileType.classify(new Path(TABLE_ROOT + "/consumer/consumer-myGroup"))) + .isEqualTo(FileType.META); + // service + assertThat(FileType.classify(new Path(TABLE_ROOT + "/service/service-primary-key-lookup"))) + .isEqualTo(FileType.META); + } + + // ===== BUCKET_INDEX files ===== + + @Test + public void testBucketIndexFiles() { + // under /index/ dir + assertThat(FileType.classify(new Path(TABLE_ROOT + "/index/index-a1b2c3d4-0"))) + .isEqualTo(FileType.BUCKET_INDEX); + // under bucket dir + assertThat( + FileType.classify( + new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/index-a1b2c3d4-0"))) + .isEqualTo(FileType.BUCKET_INDEX); + } + + // ===== GLOBAL_INDEX files ===== + + @Test + public void testGlobalIndexFiles() { + // btree global index + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/index/btree-global-index-a1b2c3d4-e5f6.index"))) + .isEqualTo(FileType.GLOBAL_INDEX); + // bitmap global index + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/index/bitmap-global-index-a1b2c3d4-e5f6.index"))) + .isEqualTo(FileType.GLOBAL_INDEX); + // lumina vector global index + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/index/lumina-vector-ann-global-index-a1b2c3d4.index"))) + .isEqualTo(FileType.GLOBAL_INDEX); + // tantivy fulltext global index + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/index/tantivy-fulltext-global-index-a1b2c3d4.index"))) + .isEqualTo(FileType.GLOBAL_INDEX); + } + + // ===== FILE_INDEX files ===== + + @Test + public void testFileIndexFiles() { + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc.index"))) + .isEqualTo(FileType.FILE_INDEX); + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet.index"))) + .isEqualTo(FileType.FILE_INDEX); + } + + // ===== isIndex() ===== + + @Test + public void testIsIndex() { + assertThat(FileType.BUCKET_INDEX.isIndex()).isTrue(); + assertThat(FileType.GLOBAL_INDEX.isIndex()).isTrue(); + assertThat(FileType.FILE_INDEX.isIndex()).isTrue(); + assertThat(FileType.META.isIndex()).isFalse(); + assertThat(FileType.DATA.isIndex()).isFalse(); + } + + // ===== DATA files ===== + + @Test + public void testDataFiles() { + // orc data file + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc"))) + .isEqualTo(FileType.DATA); + // parquet data file + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet"))) + .isEqualTo(FileType.DATA); + // changelog data file + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc"))) + .isEqualTo(FileType.DATA); + // blob file + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.blob"))) + .isEqualTo(FileType.DATA); + // vector file + assertThat( + FileType.classify( + new Path( + TABLE_ROOT + + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.vector.lance"))) + .isEqualTo(FileType.DATA); + // unknown file defaults to DATA + assertThat( + FileType.classify( + new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/unknown-file.bin"))) + .isEqualTo(FileType.DATA); + } + + // ===== Edge cases ===== + + @Test + public void testChangelogDirInParentPathNotMisjudged() { + // table root path itself contains "changelog", should not be misjudged as META + String tricky = "hdfs://cluster/changelog/warehouse/db.db/table"; + assertThat( + FileType.classify( + new Path(tricky + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc"))) + .isEqualTo(FileType.DATA); + assertThat( + FileType.classify( + new Path( + tricky + + "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc"))) + .isEqualTo(FileType.DATA); + } + + @Test + public void testBranchPaths() { + String branchRoot = TABLE_ROOT + "/branch/branch-dev"; + assertThat(FileType.classify(new Path(branchRoot + "/snapshot/snapshot-1"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(branchRoot + "/schema/schema-0"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(branchRoot + "/changelog/changelog-1"))) + .isEqualTo(FileType.META); + assertThat(FileType.classify(new Path(branchRoot + "/index/index-a1b2c3d4-0"))) + .isEqualTo(FileType.BUCKET_INDEX); + assertThat( + FileType.classify( + new Path(branchRoot + "/index/btree-global-index-a1b2c3d4.index"))) + .isEqualTo(FileType.GLOBAL_INDEX); + } +}