Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public class ConsumerManager implements Serializable {

private static final long serialVersionUID = 1L;

private static final String CONSUMER_PREFIX = "consumer-";
public static final String CONSUMER_PREFIX = "consumer-";

private final FileIO fileIO;
private final Path tablePath;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@
@ThreadSafe
public class SchemaManager implements Serializable {

private static final String SCHEMA_PREFIX = "schema-";
public static final String SCHEMA_PREFIX = "schema-";

private final FileIO fileIO;
private final Path tableRoot;
Expand Down
111 changes: 111 additions & 0 deletions paimon-core/src/main/java/org/apache/paimon/utils/FileType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.utils;

import org.apache.paimon.consumer.ConsumerManager;
import org.apache.paimon.fs.Path;
import org.apache.paimon.io.DataFilePathFactory;
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.service.ServiceManager;

/**
* Classification of Paimon files.
*
* <ul>
* <li>{@link #META}: snapshot, schema, manifest, statistics, tag, changelog metadata, hint files,
* _SUCCESS, consumer, service files
* <li>{@link #DATA}: data files and any unrecognized files (default)
* <li>{@link #BUCKET_INDEX}: bucket level index files (Hash, DV)
* <li>{@link #GLOBAL_INDEX}: table level global index files (btree, bitmap, lumina, tantivy)
* <li>{@link #FILE_INDEX}: data-file index files (bloom filter, bitmap, etc.)
* </ul>
*/
public enum FileType {
META,
DATA,
BUCKET_INDEX,
GLOBAL_INDEX,
FILE_INDEX;

private static final String MANIFEST = "manifest";
private static final String CHANGELOG_DIR = "changelog";
private static final String GLOBAL_INDEX_INFIX = "global-index-";

/** Returns {@code true} if this file type is any kind of index. */
public boolean isIndex() {
return this == BUCKET_INDEX || this == GLOBAL_INDEX || this == FILE_INDEX;
}

/**
* Classify a file based on its full path.
*
* <p>When the file does not match any known pattern, it defaults to {@link #DATA}.
*/
public static FileType classify(Path filePath) {
String name = filePath.getName();

// meta file prefixes: snapshot-, schema-, stat-, tag-, consumer-, service-
if (name.startsWith(SnapshotManager.SNAPSHOT_PREFIX)
|| name.startsWith(SchemaManager.SCHEMA_PREFIX)
|| name.startsWith(FileStorePathFactory.STATISTICS_PREFIX)
|| name.startsWith(TagManager.TAG_PREFIX)
|| name.startsWith(ConsumerManager.CONSUMER_PREFIX)
|| name.startsWith(ServiceManager.SERVICE_PREFIX)) {
return META;
}

// file index: {data-file}.index (e.g. data-xxx.orc.index)
// must check before global index since global index also ends with ".index"
if (name.endsWith(DataFilePathFactory.INDEX_PATH_SUFFIX)) {
if (name.contains(GLOBAL_INDEX_INFIX)) {
return GLOBAL_INDEX;
}
return FILE_INDEX;
}

// manifest, manifest-list, index-manifest: name contains "manifest"
if (name.contains(MANIFEST)) {
return META;
}

// bucket index: name starts with "index-" (e.g. index-{uuid}-{N})
if (name.startsWith(FileStorePathFactory.INDEX_PREFIX)) {
return BUCKET_INDEX;
}

// hint files
if ("EARLIEST".equals(name) || "LATEST".equals(name)) {
return META;
}

// success files
if ("_SUCCESS".equals(name) || name.endsWith("_SUCCESS")) {
return META;
}

// changelog metadata: parent dir is "changelog" and name starts with "changelog-"
if (name.startsWith(ChangelogManager.CHANGELOG_PREFIX)
&& CHANGELOG_DIR.equals(filePath.getParent().getName())) {
return META;
}

// default: DATA
return DATA;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public class TagManager {

private static final Logger LOG = LoggerFactory.getLogger(TagManager.class);

private static final String TAG_PREFIX = "tag-";
public static final String TAG_PREFIX = "tag-";

private final FileIO fileIO;
private final Path tablePath;
Expand Down
243 changes: 243 additions & 0 deletions paimon-core/src/test/java/org/apache/paimon/utils/FileTypeTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.utils;

import org.apache.paimon.fs.Path;

import org.junit.jupiter.api.Test;

import static org.assertj.core.api.Assertions.assertThat;

/** Tests for {@link FileType}. */
public class FileTypeTest {

private static final String TABLE_ROOT = "hdfs://cluster/warehouse/db.db/table";

// ===== META files =====

@Test
public void testMetaFiles() {
// snapshot
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-1")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/snapshot-100")))
.isEqualTo(FileType.META);
// schema
assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-0")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/schema/schema-5")))
.isEqualTo(FileType.META);
// manifest
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-a1b2c3d4-0")))
.isEqualTo(FileType.META);
// manifest-list
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/manifest-list-a1b2c3d4-0")))
.isEqualTo(FileType.META);
// index-manifest
assertThat(FileType.classify(new Path(TABLE_ROOT + "/manifest/index-manifest-a1b2c3d4-0")))
.isEqualTo(FileType.META);
// statistics
assertThat(FileType.classify(new Path(TABLE_ROOT + "/statistics/stat-a1b2c3d4-0")))
.isEqualTo(FileType.META);
// tag
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-2024-01-01")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-myTag")))
.isEqualTo(FileType.META);
// changelog metadata
assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-1")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/changelog/changelog-100")))
.isEqualTo(FileType.META);
// hint files
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/EARLIEST")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/snapshot/LATEST")))
.isEqualTo(FileType.META);
// success files
assertThat(FileType.classify(new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/_SUCCESS")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(TABLE_ROOT + "/tag/tag-success-file/myTag_SUCCESS")))
.isEqualTo(FileType.META);
// consumer
assertThat(FileType.classify(new Path(TABLE_ROOT + "/consumer/consumer-myGroup")))
.isEqualTo(FileType.META);
// service
assertThat(FileType.classify(new Path(TABLE_ROOT + "/service/service-primary-key-lookup")))
.isEqualTo(FileType.META);
}

// ===== BUCKET_INDEX files =====

@Test
public void testBucketIndexFiles() {
// under /index/ dir
assertThat(FileType.classify(new Path(TABLE_ROOT + "/index/index-a1b2c3d4-0")))
.isEqualTo(FileType.BUCKET_INDEX);
// under bucket dir
assertThat(
FileType.classify(
new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/index-a1b2c3d4-0")))
.isEqualTo(FileType.BUCKET_INDEX);
}

// ===== GLOBAL_INDEX files =====

@Test
public void testGlobalIndexFiles() {
// btree global index
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/index/btree-global-index-a1b2c3d4-e5f6.index")))
.isEqualTo(FileType.GLOBAL_INDEX);
// bitmap global index
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/index/bitmap-global-index-a1b2c3d4-e5f6.index")))
.isEqualTo(FileType.GLOBAL_INDEX);
// lumina vector global index
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/index/lumina-vector-ann-global-index-a1b2c3d4.index")))
.isEqualTo(FileType.GLOBAL_INDEX);
// tantivy fulltext global index
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/index/tantivy-fulltext-global-index-a1b2c3d4.index")))
.isEqualTo(FileType.GLOBAL_INDEX);
}

// ===== FILE_INDEX files =====

@Test
public void testFileIndexFiles() {
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc.index")))
.isEqualTo(FileType.FILE_INDEX);
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet.index")))
.isEqualTo(FileType.FILE_INDEX);
}

// ===== isIndex() =====

@Test
public void testIsIndex() {
assertThat(FileType.BUCKET_INDEX.isIndex()).isTrue();
assertThat(FileType.GLOBAL_INDEX.isIndex()).isTrue();
assertThat(FileType.FILE_INDEX.isIndex()).isTrue();
assertThat(FileType.META.isIndex()).isFalse();
assertThat(FileType.DATA.isIndex()).isFalse();
}

// ===== DATA files =====

@Test
public void testDataFiles() {
// orc data file
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
.isEqualTo(FileType.DATA);
// parquet data file
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.parquet")))
.isEqualTo(FileType.DATA);
// changelog data file
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
.isEqualTo(FileType.DATA);
// blob file
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.blob")))
.isEqualTo(FileType.DATA);
// vector file
assertThat(
FileType.classify(
new Path(
TABLE_ROOT
+ "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.vector.lance")))
.isEqualTo(FileType.DATA);
// unknown file defaults to DATA
assertThat(
FileType.classify(
new Path(TABLE_ROOT + "/dt=2024-01-01/bucket-0/unknown-file.bin")))
.isEqualTo(FileType.DATA);
}

// ===== Edge cases =====

@Test
public void testChangelogDirInParentPathNotMisjudged() {
// table root path itself contains "changelog", should not be misjudged as META
String tricky = "hdfs://cluster/changelog/warehouse/db.db/table";
assertThat(
FileType.classify(
new Path(tricky + "/dt=2024-01-01/bucket-0/data-a1b2c3d4-0.orc")))
.isEqualTo(FileType.DATA);
assertThat(
FileType.classify(
new Path(
tricky
+ "/dt=2024-01-01/bucket-0/changelog-a1b2c3d4-0.orc")))
.isEqualTo(FileType.DATA);
}

@Test
public void testBranchPaths() {
String branchRoot = TABLE_ROOT + "/branch/branch-dev";
assertThat(FileType.classify(new Path(branchRoot + "/snapshot/snapshot-1")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(branchRoot + "/schema/schema-0")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(branchRoot + "/changelog/changelog-1")))
.isEqualTo(FileType.META);
assertThat(FileType.classify(new Path(branchRoot + "/index/index-a1b2c3d4-0")))
.isEqualTo(FileType.BUCKET_INDEX);
assertThat(
FileType.classify(
new Path(branchRoot + "/index/btree-global-index-a1b2c3d4.index")))
.isEqualTo(FileType.GLOBAL_INDEX);
}
}