Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions fe/fe-common/src/main/java/org/apache/doris/common/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -3418,6 +3418,26 @@ public static int metaServiceRpcRetryTimes() {
+ "for example: s3_load_endpoint_white_list=a,b,c"})
public static String[] s3_load_endpoint_white_list = {};

// Feature switch, enabled by default: when a path has no '*' or '?' wildcards, object metadata is
// fetched with per-object HEAD requests instead of ListObjects, so ListBucket permission is not needed.
// Setting it to false restores the listing-only behavior.
// NOTE(review): the Azure globList path visible in this change reads this flag too, despite the s3_ prefix.
@ConfField(mutable = true, description = {
"对于确定性的 S3 路径(无通配符如 *, ?),使用 HEAD 请求代替 ListObjects 来避免需要 ListBucket 权限。"
+ "花括号模式 {1,2,3} 和非否定方括号模式 [abc] 会展开为具体路径。"
+ "这对于只有 GetObject 权限的场景很有用。如果遇到问题可以设置为 false 回退到原有行为。",
"For deterministic S3 paths (without wildcards like *, ?), use HEAD requests instead of "
+ "ListObjects to avoid requiring ListBucket permission. Brace patterns {1,2,3} and "
+ "non-negated bracket patterns [abc] are expanded to concrete paths. This is useful when only "
+ "GetObject permission is granted. Set to false to fall back to the original listing behavior."
})
public static boolean s3_skip_list_for_deterministic_path = true;

// Upper bound (default 100) on how many concrete paths a deterministic pattern may expand to
// before the HEAD-request shortcut is abandoned in favor of ListObjects.
// NOTE(review): the Azure getProperties path visible in this change compares the size of the
// already-expanded path list against this limit, so it caps request count, not expansion cost.
@ConfField(mutable = true, description = {
"当使用 HEAD 请求代替 ListObjects 时,展开路径的最大数量。如果展开的路径数量超过此限制,"
+ "将回退到使用 ListObjects。这可以防止类似 {1..100}/{1..100} 的模式触发过多的 HEAD 请求。",
"Maximum number of expanded paths when using HEAD requests instead of ListObjects. "
+ "If the expanded path count exceeds this limit, falls back to ListObjects. "
+ "This prevents patterns like {1..100}/{1..100} from triggering too many HEAD requests."
})
public static int s3_head_request_max_paths = 100;

@ConfField(mutable = true, description = {
"指定 Azure endpoint 域名后缀白名单(包含 blob 与 dfs),多个值使用逗号分隔。"
+ "默认值为 .blob.core.windows.net,.dfs.core.windows.net,"
Expand Down
232 changes: 232 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java
Original file line number Diff line number Diff line change
Expand Up @@ -433,4 +433,236 @@ public static void validateAndTestEndpoint(String endpoint) throws UserException
SecurityChecker.getInstance().stopSSRFChecking();
}
}

/**
 * Check if a path pattern is deterministic, meaning every candidate file path can be
 * enumerated without listing the bucket.
 *
 * A pattern is deterministic when it contains no true wildcard ('*', '?') and no backslash
 * escape, and every bracket expression in it is a plain, non-negated character class such as
 * [abc] or [0-9]. Brace patterns ({...}) are always allowed because they expand to a finite set.
 *
 * The following are reported as NOT deterministic, so that callers fall back to listing:
 * - negated bracket patterns ([!abc], [^abc]), which match an open-ended set of characters;
 * - malformed or empty brackets ("[abc" or "[]");
 * - bracket patterns containing ',', '{' or '}': expandBracketPatterns() rewrites [..] as a
 *   brace alternation, and those characters would be misread as brace syntax
 *   (e.g. [a,b] would become {a,,,b} and yield empty alternatives);
 * - a null pattern.
 *
 * This allows skipping S3 ListBucket operations when only GetObject permission is available.
 *
 * @param pathPattern Path that may contain glob patterns; may be null
 * @return true if the pattern is deterministic (expandable without listing)
 */
public static boolean isDeterministicPattern(String pathPattern) {
    if (pathPattern == null) {
        // Nothing to expand; let the caller use its regular listing path.
        return false;
    }
    // '*' and '?' are true wildcards that can only be resolved by listing.
    // '{' is NOT a wildcard: brace alternation expands to a finite set of paths.
    if (pathPattern.indexOf('*') != -1 || pathPattern.indexOf('?') != -1) {
        return false;
    }
    // Escaped characters indicate a complex pattern that is not expanded by hand here.
    if (pathPattern.indexOf('\\') != -1) {
        return false;
    }
    // '[' is conditionally deterministic: [abc] / [0-9] are, [!abc] / [^abc] are not.
    return areBracketPatternsDeterministic(pathPattern);
}

/**
 * Check if all bracket patterns in the path can be safely expanded to a finite set.
 * - [abc], [0-9], [a-zA-Z] are deterministic (finite character sets)
 * - [!abc], [^abc] are non-deterministic (negation requires listing)
 * - Malformed brackets (no closing ]) and empty brackets ([]) are non-deterministic
 * - Brackets containing ',', '{' or '}' are treated as non-deterministic because they
 *   cannot be represented as a brace alternation without being misparsed
 *
 * @param pattern non-null path pattern
 * @return true if every bracket expression is expandable
 */
private static boolean areBracketPatternsDeterministic(String pattern) {
    int i = 0;
    while (i < pattern.length()) {
        if (pattern.charAt(i) != '[') {
            i++;
            continue;
        }
        int end = pattern.indexOf(']', i + 1);
        if (end == -1) {
            // Malformed bracket - no closing ]
            return false;
        }
        int contentStart = i + 1;
        if (contentStart == end) {
            // Empty brackets [] - malformed
            return false;
        }
        char first = pattern.charAt(contentStart);
        if (first == '!' || first == '^') {
            // Negated class
            return false;
        }
        for (int j = contentStart; j < end; j++) {
            char c = pattern.charAt(j);
            if (c == ',' || c == '{' || c == '}') {
                // Would collide with brace-expansion syntax once rewritten as {..}
                return false;
            }
        }
        i = end + 1;
    }
    return true;
}

/**
 * Expand bracket character class patterns to brace patterns.
 * This converts [abc] to {a,b,c} and [0-9] to {0,1,2,...,9} so that
 * brace expansion can handle them.
 *
 * Only call this on patterns already verified as deterministic by isDeterministicPattern()
 * (i.e., no negated brackets like [!...] or [^...]). No escaping is applied to the emitted
 * alternatives, so bracket content containing ',', '{' or '}' cannot be represented faithfully.
 *
 * A '[' without a closing ']' is copied through unchanged.
 *
 * Examples:
 * - "file[abc].csv" => "file{a,b,c}.csv"
 * - "file[0-9].csv" => "file{0,1,2,3,4,5,6,7,8,9}.csv"
 * - "file[a-cX].csv" => "file{a,b,c,X}.csv"
 * - "file.csv" => "file.csv" (no brackets)
 *
 * @param pathPattern Path with optional bracket patterns (must not contain negated brackets)
 * @return Path with brackets converted to brace patterns
 */
public static String expandBracketPatterns(String pathPattern) {
    StringBuilder result = new StringBuilder();
    int i = 0;
    while (i < pathPattern.length()) {
        char current = pathPattern.charAt(i);
        int end = current == '[' ? pathPattern.indexOf(']', i + 1) : -1;
        if (end == -1) {
            // Ordinary character, or a malformed '[' with no closing ']': keep as-is
            result.append(current);
            i++;
            continue;
        }
        List<Character> chars = expandBracketContent(pathPattern.substring(i + 1, end));
        result.append('{');
        for (int j = 0; j < chars.size(); j++) {
            if (j > 0) {
                result.append(',');
            }
            result.append(chars.get(j));
        }
        result.append('}');
        i = end + 1;
    }
    return result.toString();
}

/**
 * Expand the inside of a bracket expression into its distinct characters, in first-seen order.
 * "x-y" triples are treated as inclusive ranges; a range written high-to-low is walked
 * downwards. Any other character is taken literally.
 *
 * @param content text between '[' and ']'
 * @return distinct characters of the class, in the order they were first produced
 */
private static List<Character> expandBracketContent(String content) {
    // Insertion-ordered set: keeps first-seen order while making de-duplication O(1)
    // instead of a linear List.contains() scan per character.
    java.util.Set<Character> chars = new java.util.LinkedHashSet<>();
    int i = 0;
    while (i < content.length()) {
        if (i + 2 < content.length() && content.charAt(i + 1) == '-') {
            // Range like a-z or 0-9. The counter is an int on purpose: a char counter wraps
            // around at U+FFFF / U+0000, so a range touching either bound would never terminate.
            int start = content.charAt(i);
            int end = content.charAt(i + 2);
            int step = start <= end ? 1 : -1;
            for (int c = start; c != end + step; c += step) {
                chars.add((char) c);
            }
            i += 3;
        } else {
            chars.add(content.charAt(i));
            i++;
        }
    }
    return new ArrayList<>(chars);
}

/**
 * Expand brace patterns in a path to generate all distinct concrete file paths.
 * Handles nested and multiple brace patterns. Paths are returned in expansion order and each
 * path appears once, even when alternatives overlap (e.g. "{a,a}" or "{a,{a,b}}"), so callers
 * issuing one request per path do not probe or report the same object twice.
 *
 * Only comma-separated alternation is interpreted here. Numeric ranges such as {1..3} are not
 * expanded by this method; they must already have been rewritten to comma lists
 * (e.g. by extendGlobs) before calling it.
 *
 * Examples:
 * - "file{1,2,3}.csv" => ["file1.csv", "file2.csv", "file3.csv"]
 * - "data/part{1,2,3}/file.csv" => ["data/part1/file.csv", "data/part2/file.csv", "data/part3/file.csv"]
 * - "x{a,{b,c}}y" => ["xay", "xby", "xcy"]
 * - "file.csv" => ["file.csv"] (no braces)
 * - "file{1,2.csv" => ["file{1,2.csv"] (unbalanced brace, kept literal)
 *
 * @param pathPattern Path with optional brace patterns (already processed by extendGlobs)
 * @return List of distinct expanded concrete paths
 */
public static List<String> expandBracePatterns(String pathPattern) {
    // Insertion-ordered set: preserves expansion order while dropping repeated paths.
    java.util.Set<String> expanded = new java.util.LinkedHashSet<>();
    expandBracePatternsRecursive(pathPattern, expanded);
    return new ArrayList<>(expanded);
}

/**
 * Expand the first top-level brace group of the pattern and recurse on each substitution.
 * Recursing on the whole substituted string expands both nested braces inside the chosen
 * alternative and any later brace groups in the suffix.
 *
 * @param pattern pattern that may still contain brace groups
 * @param result  collector receiving every fully expanded path
 */
private static void expandBracePatternsRecursive(String pattern, java.util.Collection<String> result) {
    int braceStart = pattern.indexOf('{');
    if (braceStart == -1) {
        // No more braces, add the pattern as-is
        result.add(pattern);
        return;
    }

    // Find matching closing brace (handle nested braces)
    int braceEnd = findMatchingBrace(pattern, braceStart);
    if (braceEnd == -1) {
        // Malformed pattern, treat as literal
        result.add(pattern);
        return;
    }

    String prefix = pattern.substring(0, braceStart);
    String suffix = pattern.substring(braceEnd + 1);

    // Split by comma, but respect nested braces
    for (String alt : splitBraceContent(pattern.substring(braceStart + 1, braceEnd))) {
        expandBracePatternsRecursive(prefix + alt + suffix, result);
    }
}

/**
 * Locate the '}' that closes the '{' at the given index, accounting for nesting.
 *
 * @param pattern pattern to scan
 * @param start   index of the opening '{'
 * @return index of the matching '}', or -1 if the braces are unbalanced
 */
private static int findMatchingBrace(String pattern, int start) {
    int depth = 0;
    for (int i = start; i < pattern.length(); i++) {
        char c = pattern.charAt(i);
        if (c == '{') {
            depth++;
        } else if (c == '}') {
            depth--;
            if (depth == 0) {
                return i;
            }
        }
    }
    return -1;
}

/**
 * Split the inside of a brace group on commas that are not enclosed in nested braces.
 * Always returns at least one element; empty alternatives are preserved as empty strings.
 *
 * @param content text between the outer '{' and '}'
 * @return top-level alternatives in order
 */
private static List<String> splitBraceContent(String content) {
    List<String> parts = new ArrayList<>();
    int depth = 0;
    int start = 0;

    for (int i = 0; i < content.length(); i++) {
        char c = content.charAt(i);
        if (c == '{') {
            depth++;
        } else if (c == '}') {
            depth--;
        } else if (c == ',' && depth == 0) {
            parts.add(content.substring(start, i));
            start = i + 1;
        }
    }
    parts.add(content.substring(start));
    return parts;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.doris.fs.obj;

import org.apache.doris.backup.Status;
import org.apache.doris.common.Config;
import org.apache.doris.common.DdlException;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.S3URI;
Expand Down Expand Up @@ -357,8 +358,24 @@ public Status globList(String remotePath, List<RemoteFile> result, boolean fileN
try {
remotePath = AzurePropertyUtils.validateAndNormalizeUri(remotePath);
S3URI uri = S3URI.create(remotePath, isUsePathStyle, forceParsingByStandardUri);
String globPath = S3Util.extendGlobs(uri.getKey());
String bucket = uri.getBucket();

// Optimization: For deterministic paths (no wildcards like *, ?),
// use getProperties requests instead of listing to avoid requiring list permission.
// Controlled by config: s3_skip_list_for_deterministic_path
// Note: Skip when using path style (see S3ObjStorage for detailed explanation)
String keyPattern = uri.getKey();
if (Config.s3_skip_list_for_deterministic_path
&& !isUsePathStyle
&& S3Util.isDeterministicPattern(keyPattern)) {
Status headStatus = globListByGetProperties(bucket, keyPattern, result, fileNameOnly, startTime);
if (headStatus != null) {
return headStatus;
}
// If headStatus is null, fall through to use listing
}

String globPath = S3Util.extendGlobs(uri.getKey());
if (LOG.isDebugEnabled()) {
LOG.debug("try to glob list for azure, remote path {}, orig {}", globPath, remotePath);
}
Expand Down Expand Up @@ -436,6 +453,86 @@ public Status globList(String remotePath, List<RemoteFile> result, boolean fileN
return st;
}

/**
* Get file metadata using getProperties requests for deterministic paths.
* This avoids requiring list permission when only read permission is granted.
*
* @param bucket Azure container name
* @param keyPattern The key pattern (may contain {..} brace or [...] bracket patterns but no wildcards)
* @param result List to store matching RemoteFile objects
* @param fileNameOnly If true, only store file names; otherwise store full paths
* @param startTime Start time for logging duration
* @return Status if successful, null if should fall back to listing
*/
private Status globListByGetProperties(String bucket, String keyPattern,
List<RemoteFile> result, boolean fileNameOnly, long startTime) {
try {
// First expand [...] brackets to {...} braces, then expand {..} ranges, then expand braces
String expandedPattern = S3Util.expandBracketPatterns(keyPattern);
expandedPattern = S3Util.extendGlobs(expandedPattern);
List<String> expandedPaths = S3Util.expandBracePatterns(expandedPattern);

// Fall back to listing if too many paths to avoid overwhelming Azure with requests
// Controlled by config: s3_head_request_max_paths
if (expandedPaths.size() > Config.s3_head_request_max_paths) {
LOG.info("Expanded path count {} exceeds limit {}, falling back to LIST",
expandedPaths.size(), Config.s3_head_request_max_paths);
return null;
}
Comment on lines +470 to +481
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s3_head_request_max_paths is checked only after expanding patterns into the full expandedPaths list, so extremely large deterministic patterns can still consume significant CPU/memory during expansion before triggering the fallback. Consider adding short-circuiting/early-abort expansion once the configured max is exceeded.

Copilot uses AI. Check for mistakes.

if (LOG.isDebugEnabled()) {
LOG.debug("Using getProperties requests for deterministic path pattern, expanded to {} paths",
expandedPaths.size());
}

BlobContainerClient containerClient = getClient().getBlobContainerClient(bucket);
long matchCnt = 0;
for (String key : expandedPaths) {
String fullPath = constructS3Path(key, bucket);
try {
BlobClient blobClient = containerClient.getBlobClient(key);
BlobProperties props = blobClient.getProperties();

matchCnt++;
RemoteFile remoteFile = new RemoteFile(
fileNameOnly ? Paths.get(key).getFileName().toString() : fullPath,
true, // isFile
props.getBlobSize(),
props.getBlobSize(),
props.getLastModified() != null
? props.getLastModified().toEpochSecond() : 0
);
result.add(remoteFile);

if (LOG.isDebugEnabled()) {
LOG.debug("getProperties success for {}: size={}", fullPath, props.getBlobSize());
}
} catch (BlobStorageException e) {
if (e.getStatusCode() == HttpStatus.SC_NOT_FOUND
|| BlobErrorCode.BLOB_NOT_FOUND.equals(e.getErrorCode())) {
// File does not exist, skip it (this is expected for some expanded patterns)
if (LOG.isDebugEnabled()) {
LOG.debug("File does not exist (skipped): {}", fullPath);
}
} else {
throw e;
}
}
}

if (LOG.isDebugEnabled()) {
long duration = System.nanoTime() - startTime;
LOG.debug("Deterministic path getProperties requests: checked {} paths, found {} files, took {} ms",
expandedPaths.size(), matchCnt, duration / 1000 / 1000);
}

return Status.OK;
} catch (Exception e) {
LOG.warn("Failed to use getProperties requests, falling back to listing: {}", e.getMessage());
return null;
}
Comment on lines +530 to +533
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If an exception occurs after some successful getProperties calls, this method returns null to fall back to LIST but leaves already-added entries in result, which can lead to duplicates/partial results when the listing path runs. Consider building results in a temporary list and only appending on success (or rollback result before returning null).

Copilot uses AI. Check for mistakes.
}

public Status listFiles(String remotePath, boolean recursive, List<RemoteFile> result) {
try {
remotePath = AzurePropertyUtils.validateAndNormalizeUri(remotePath);
Expand Down
Loading
Loading