From dcac65f281c6c366a4856ede96917a6ba82f07f9 Mon Sep 17 00:00:00 2001
From: Yongqiang YANG
Date: Sat, 28 Feb 2026 00:02:00 -0800
Subject: [PATCH] [opt](s3) Skip S3 listing for deterministic file paths using HEAD requests (#60414)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- For S3 paths without true wildcards (`*`, `?`, negated `[!...]`/`[^...]`), use HEAD requests instead of ListObjectsV2 to avoid requiring `s3:ListBucket` permission
- Brace patterns like `{1..10}` and non-negated bracket patterns like `[abc]` are expanded to concrete file paths and verified individually with HEAD requests
- This enables loading data from S3 when only `s3:GetObject` permission is granted

S3 `ListBucket` permission is often more restricted than `GetObject` in enterprise environments. When users specify exact file paths or deterministic patterns like `file{1..3}.csv`, listing is unnecessary since the file names can be determined from the input.

| File | Description |
|------|-------------|
| `Config.java` | Added `s3_skip_list_for_deterministic_path` and `s3_head_request_max_paths` |
| `S3Util.java` | Added `isDeterministicPattern()` to detect paths without wildcards, `expandBracketPatterns()` to rewrite bracket classes as brace patterns, and `expandBracePatterns()` to expand brace patterns to concrete paths |
| `S3ObjStorage.java` | Modified `globListInternal()` to use HEAD requests for deterministic paths |
| `AzureObjStorage.java` | Modified `globList()` to use getProperties requests for deterministic paths |
| `S3UtilTest.java` | Added unit tests for new utility methods |

| Path | Deterministic? 
| Behavior |
|------|----------------|----------|
| `s3://bucket/data/file.csv` | ✅ Yes | Single HEAD request |
| `s3://bucket/data/file{1..3}.csv` | ✅ Yes | 3 HEAD requests |
| `s3://bucket/data/*.csv` | ❌ No | Falls back to LIST |

- [x] Added unit tests for `isDeterministicPattern()`
- [x] Added unit tests for `expandBracePatterns()`
- [ ] Manual testing with S3 TVF and Broker Load

🤖 Generated with [Claude Code](https://claude.ai/code)
---
 .../java/org/apache/doris/common/Config.java  |  20 ++
 .../org/apache/doris/common/util/S3Util.java  | 252 ++++++++++++++++++
 .../apache/doris/fs/obj/AzureObjStorage.java  |  99 +++++++-
 .../org/apache/doris/fs/obj/S3ObjStorage.java | 108 ++++++++
 .../apache/doris/common/util/S3UtilTest.java  | 208 ++++++++++++++++
 5 files changed, 686 insertions(+), 1 deletion(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 921796670c48b5..8cb93b0688adb6 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -3417,6 +3417,26 @@ public static int metaServiceRpcRetryTimes() {
                     + "for example: s3_load_endpoint_white_list=a,b,c"})
     public static String[] s3_load_endpoint_white_list = {};
 
+    @ConfField(mutable = true, description = {
+            "对于确定性的 S3 路径(无通配符如 *, ?),使用 HEAD 请求代替 ListObjects 来避免需要 ListBucket 权限。"
+                    + "花括号模式 {1,2,3} 和非否定方括号模式 [abc] 会展开为具体路径。"
+                    + "这对于只有 GetObject 权限的场景很有用。如果遇到问题可以设置为 false 回退到原有行为。",
+            "For deterministic S3 paths (without wildcards like *, ?), use HEAD requests instead of "
+                    + "ListObjects to avoid requiring ListBucket permission. Brace patterns {1,2,3} and "
+                    + "non-negated bracket patterns [abc] are expanded to concrete paths. This is useful when only "
+                    + "GetObject permission is granted. Set to false to fall back to the original listing behavior."
+    })
+    public static boolean s3_skip_list_for_deterministic_path = true;
+
+    @ConfField(mutable = true, description = {
+            "当使用 HEAD 请求代替 ListObjects 时,展开路径的最大数量。如果展开的路径数量超过此限制,"
+                    + "将回退到使用 ListObjects。这可以防止类似 {1..100}/{1..100} 的模式触发过多的 HEAD 请求。",
+            "Maximum number of expanded paths when using HEAD requests instead of ListObjects. "
+                    + "If the expanded path count exceeds this limit, falls back to ListObjects. "
+                    + "This prevents patterns like {1..100}/{1..100} from triggering too many HEAD requests."
+    })
+    public static int s3_head_request_max_paths = 100;
+
     @ConfField(mutable = true, description = {
             "指定 Azure endpoint 域名后缀白名单(包含 blob 与 dfs),多个值使用逗号分隔。"
                     + "默认值为 .blob.core.windows.net,.dfs.core.windows.net,"
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java
index e537d1f47b0f1f..3e4f4e7a62f9f6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3Util.java
@@ -433,4 +433,256 @@ public static void validateAndTestEndpoint(String endpoint) throws UserException
             SecurityChecker.getInstance().stopSSRFChecking();
         }
     }
+
+    /**
+     * Check if a path pattern is deterministic, meaning all file paths can be determined
+     * without listing. A pattern is deterministic if it contains no true wildcard characters
+     * (*, ?) but may contain brace patterns ({...}) and non-negated bracket patterns ([abc], [0-9])
+     * which can be expanded to concrete paths.
+     *
+     * Negated bracket patterns ([!abc], [^abc]) are NOT deterministic because they match
+     * any character except those listed, requiring a listing to discover matches.
+     * Bracket patterns that can match ',', '{' or '}' are also NOT deterministic, because they
+     * cannot be rewritten as a brace alternation without changing what they match.
+     *
+     * This allows skipping S3 ListBucket operations when only GetObject permission is available.
+     *
+     * @param pathPattern Path that may contain glob patterns
+     * @return true if the pattern is deterministic (expandable without listing)
+     */
+    public static boolean isDeterministicPattern(String pathPattern) {
+        // Check for wildcard characters that require listing
+        // Note: '{' is NOT a wildcard - it's a brace expansion pattern that can be deterministically expanded
+        // Note: '[' is conditionally deterministic - [abc] can be expanded, but [!abc]/[^abc] cannot
+        char[] wildcardChars = {'*', '?'};
+        for (char c : wildcardChars) {
+            if (pathPattern.indexOf(c) != -1) {
+                return false;
+            }
+        }
+        // Check for escaped characters which indicate complex patterns
+        if (pathPattern.indexOf('\\') != -1) {
+            return false;
+        }
+        // Check bracket patterns: [abc] and [0-9] are deterministic, [!abc] and [^abc] are not
+        if (!areBracketPatternsDeterministic(pathPattern)) {
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Check if all bracket patterns in the path are deterministic (non-negated).
+     * - [abc], [0-9], [a-zA-Z] are deterministic (can be expanded to finite character sets)
+     * - [!abc], [^abc] are non-deterministic (negation requires listing)
+     * - Classes that can match ',', '{' or '}' (e.g. [a,b], [+-.]) are non-deterministic
+     * - Malformed brackets (no closing ]) are non-deterministic
+     */
+    private static boolean areBracketPatternsDeterministic(String pattern) {
+        int i = 0;
+        while (i < pattern.length()) {
+            if (pattern.charAt(i) == '[') {
+                int end = pattern.indexOf(']', i + 1);
+                if (end == -1) {
+                    // Malformed bracket - no closing ], treat as non-deterministic
+                    return false;
+                }
+                int contentStart = i + 1;
+                if (contentStart == end) {
+                    // Empty brackets [] - malformed, treat as non-deterministic
+                    return false;
+                }
+                // Check for negation
+                char first = pattern.charAt(contentStart);
+                if (first == '!' || first == '^') {
+                    return false;
+                }
+                // A class that can match ',', '{' or '}' cannot be rewritten as a brace alternation
+                // without corrupting it (e.g. [a,b] would become {a,,,b}), so leave it to listing.
+                for (char member : expandBracketContent(pattern.substring(contentStart, end))) {
+                    if (member == ',' || member == '{' || member == '}') {
+                        return false;
+                    }
+                }
+                i = end + 1;
+            } else {
+                i++;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Expand bracket character class patterns to brace patterns.
+     * This converts [abc] to {a,b,c} and [0-9] to {0,1,2,...,9} so that
+     * the existing brace expansion can handle them.
+     *
+     * Only call this on patterns already verified as deterministic by isDeterministicPattern()
+     * (i.e., no negated brackets like [!...] or [^...], and no ',', '{' or '}' class members).
+     *
+     * Examples:
+     * - "file[abc].csv" => "file{a,b,c}.csv"
+     * - "file[0-9].csv" => "file{0,1,2,3,4,5,6,7,8,9}.csv"
+     * - "file[a-cX].csv" => "file{a,b,c,X}.csv"
+     * - "file.csv" => "file.csv" (no brackets)
+     *
+     * @param pathPattern Path with optional bracket patterns (must not contain negated brackets)
+     * @return Path with brackets converted to brace patterns
+     */
+    public static String expandBracketPatterns(String pathPattern) {
+        StringBuilder result = new StringBuilder();
+        int i = 0;
+        while (i < pathPattern.length()) {
+            if (pathPattern.charAt(i) == '[') {
+                int end = pathPattern.indexOf(']', i + 1);
+                if (end == -1) {
+                    // Malformed, keep as-is
+                    result.append(pathPattern.charAt(i));
+                    i++;
+                    continue;
+                }
+                String content = pathPattern.substring(i + 1, end);
+                List<Character> chars = expandBracketContent(content);
+                result.append('{');
+                for (int j = 0; j < chars.size(); j++) {
+                    if (j > 0) {
+                        result.append(',');
+                    }
+                    result.append(chars.get(j));
+                }
+                result.append('}');
+                i = end + 1;
+            } else {
+                result.append(pathPattern.charAt(i));
+                i++;
+            }
+        }
+        return result.toString();
+    }
+
+    /**
+     * Expand the inside of a bracket class (the text between '[' and ']') into its distinct
+     * member characters, in first-seen order. "x-y" triples are treated as inclusive ranges.
+     */
+    private static List<Character> expandBracketContent(String content) {
+        List<Character> chars = new ArrayList<>();
+        int i = 0;
+        while (i < content.length()) {
+            if (i + 2 < content.length() && content.charAt(i + 1) == '-') {
+                // Range like a-z or 0-9 (a reversed range such as z-a is expanded in descending order).
+                // The counter is an int: a char counter would wrap around at Character.MAX_VALUE
+                // (or MIN_VALUE) and loop forever.
+                char start = content.charAt(i);
+                char end = content.charAt(i + 2);
+                if (start <= end) {
+                    for (int code = start; code <= end; code++) {
+                        char c = (char) code;
+                        if (!chars.contains(c)) {
+                            chars.add(c);
+                        }
+                    }
+                } else {
+                    for (int code = start; code >= end; code--) {
+                        char c = (char) code;
+                        if (!chars.contains(c)) {
+                            chars.add(c);
+                        }
+                    }
+                }
+                i += 3;
+            } else {
+                char c = content.charAt(i);
+                if (!chars.contains(c)) {
+                    chars.add(c);
+                }
+                i++;
+            }
+        }
+        return chars;
+    }
+
+    /**
+     * Expand brace patterns in a path to generate all concrete file paths.
+     * Handles nested and multiple brace patterns.
+     *
+     * Examples:
+     * - "file{1,2,3}.csv" => ["file1.csv", "file2.csv", "file3.csv"]
+     * - "data/part{1..3}/file.csv" => ["data/part1/file.csv", "data/part2/file.csv", "data/part3/file.csv"]
+     * - "file.csv" => ["file.csv"] (no braces)
+     *
+     * @param pathPattern Path with optional brace patterns (already processed by extendGlobs)
+     * @return List of expanded concrete paths
+     */
+    public static List<String> expandBracePatterns(String pathPattern) {
+        List<String> result = new ArrayList<>();
+        expandBracePatternsRecursive(pathPattern, result);
+        return result;
+    }
+
+    /** Expand the leftmost brace group of the pattern and recurse, appending each brace-free path to result. */
+    private static void expandBracePatternsRecursive(String pattern, List<String> result) {
+        int braceStart = pattern.indexOf('{');
+        if (braceStart == -1) {
+            // No more braces, add the pattern as-is
+            result.add(pattern);
+            return;
+        }
+
+        // Find matching closing brace (handle nested braces)
+        int braceEnd = findMatchingBrace(pattern, braceStart);
+        if (braceEnd == -1) {
+            // Malformed pattern, treat as literal
+            result.add(pattern);
+            return;
+        }
+
+        String prefix = pattern.substring(0, braceStart);
+        String braceContent = pattern.substring(braceStart + 1, braceEnd);
+        String suffix = pattern.substring(braceEnd + 1);
+
+        // Split by comma, but respect nested braces
+        List<String> alternatives = splitBraceContent(braceContent);
+
+        for (String alt : alternatives) {
+            // Recursively expand any remaining braces in the suffix
+            expandBracePatternsRecursive(prefix + alt + suffix, result);
+        }
+    }
+
+    /** Return the index of the '}' that closes the '{' at start, or -1 if the braces are unbalanced. */
+    private static int findMatchingBrace(String pattern, int start) {
+        int depth = 0;
+        for (int i = start; i < pattern.length(); i++) {
+            char c = pattern.charAt(i);
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                depth--;
+                if (depth == 0) {
+                    return i;
+                }
+            }
+        }
+        return -1;
+    }
+
+    /** Split brace content on the commas that are not nested inside an inner brace group. */
+    private static List<String> splitBraceContent(String content) {
+        List<String> parts = new ArrayList<>();
+        int depth = 0;
+        int start = 0;
+
+        for (int i = 0; i < content.length(); i++) {
+            char c = content.charAt(i);
+            if (c == '{') {
+                depth++;
+            } else if (c == '}') {
+                depth--;
+            } else if (c == ',' && depth == 0) {
+                parts.add(content.substring(start, i));
+                start = i + 1;
+            }
+        }
+        parts.add(content.substring(start));
+        return parts;
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
index 6b0c198d8412ca..4929e34e7f5a74 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/AzureObjStorage.java
@@ -18,6 +18,7 @@
 package org.apache.doris.fs.obj;
 
 import org.apache.doris.backup.Status;
+import org.apache.doris.common.Config;
 import org.apache.doris.common.DdlException;
 import org.apache.doris.common.UserException;
 import org.apache.doris.common.util.S3URI;
@@ -357,8 +358,24 @@ public Status globList(String remotePath, List result, boolean fileN
         try {
             remotePath = AzurePropertyUtils.validateAndNormalizeUri(remotePath);
             S3URI uri = S3URI.create(remotePath, isUsePathStyle, forceParsingByStandardUri);
-            String globPath = S3Util.extendGlobs(uri.getKey());
             String bucket = uri.getBucket();
+
+            // Optimization: For deterministic paths (no wildcards like *, ?),
+            // use getProperties requests instead of listing to avoid requiring list permission.
+            // Controlled by config: s3_skip_list_for_deterministic_path
+            // Note: Skip when using path style (see S3ObjStorage for detailed explanation)
+            String keyPattern = uri.getKey();
+            if (Config.s3_skip_list_for_deterministic_path
+                    && !isUsePathStyle
+                    && S3Util.isDeterministicPattern(keyPattern)) {
+                Status headStatus = globListByGetProperties(bucket, keyPattern, result, fileNameOnly, startTime);
+                if (headStatus != null) {
+                    return headStatus;
+                }
+                // If headStatus is null, fall through to use listing
+            }
+
+            String globPath = S3Util.extendGlobs(uri.getKey());
             if (LOG.isDebugEnabled()) {
                 LOG.debug("try to glob list for azure, remote path {}, orig {}", globPath, remotePath);
             }
@@ -436,6 +453,94 @@ public Status globList(String remotePath, List result, boolean fileN
         return st;
     }
 
+    /**
+     * Get file metadata using getProperties requests for deterministic paths.
+     * This avoids requiring list permission when only read permission is granted.
+     * If probing fails part-way, entries already appended to {@code result} are removed again
+     * before null is returned, so the listing fallback does not report them twice.
+     *
+     * @param bucket Azure container name
+     * @param keyPattern The key pattern (may contain {..} brace or [...] bracket patterns but no wildcards)
+     * @param result List to store matching RemoteFile objects
+     * @param fileNameOnly If true, only store file names; otherwise store full paths
+     * @param startTime Start time for logging duration
+     * @return Status if successful, null if should fall back to listing
+     */
+    private Status globListByGetProperties(String bucket, String keyPattern,
+            List<RemoteFile> result, boolean fileNameOnly, long startTime) {
+        // Remember where this call's entries begin so a mid-way failure can be rolled back.
+        int sizeBeforeProbe = result.size();
+        try {
+            // First expand [...] brackets to {...} braces, then expand {..} ranges, then expand braces
+            String expandedPattern = S3Util.expandBracketPatterns(keyPattern);
+            expandedPattern = S3Util.extendGlobs(expandedPattern);
+            List<String> expandedPaths = S3Util.expandBracePatterns(expandedPattern);
+
+            // Fall back to listing if too many paths to avoid overwhelming Azure with requests
+            // Controlled by config: s3_head_request_max_paths
+            if (expandedPaths.size() > Config.s3_head_request_max_paths) {
+                LOG.info("Expanded path count {} exceeds limit {}, falling back to LIST",
+                        expandedPaths.size(), Config.s3_head_request_max_paths);
+                return null;
+            }
+
+            if (LOG.isDebugEnabled()) {
+                LOG.debug("Using getProperties requests for deterministic path pattern, expanded to {} paths",
+                        expandedPaths.size());
+            }
+
+            BlobContainerClient containerClient = getClient().getBlobContainerClient(bucket);
+            long matchCnt = 0;
+            for (String key : expandedPaths) {
+                String fullPath = constructS3Path(key, bucket);
+                try {
+                    BlobClient blobClient = containerClient.getBlobClient(key);
+                    BlobProperties props = blobClient.getProperties();
+
+                    matchCnt++;
+                    RemoteFile remoteFile = new RemoteFile(
+                            fileNameOnly ? Paths.get(key).getFileName().toString() : fullPath,
+                            true, // isFile
+                            props.getBlobSize(),
+                            props.getBlobSize(),
+                            // NOTE(review): epoch seconds here, epoch millis in S3ObjStorage - confirm the unit
+                            props.getLastModified() != null
+                                    ? props.getLastModified().toEpochSecond() : 0
+                    );
+                    result.add(remoteFile);
+
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("getProperties success for {}: size={}", fullPath, props.getBlobSize());
+                    }
+                } catch (BlobStorageException e) {
+                    if (e.getStatusCode() == HttpStatus.SC_NOT_FOUND
+                            || BlobErrorCode.BLOB_NOT_FOUND.equals(e.getErrorCode())) {
+                        // File does not exist, skip it (this is expected for some expanded patterns)
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("File does not exist (skipped): {}", fullPath);
+                        }
+                    } else {
+                        throw e;
+                    }
+                }
+            }
+
+            if (LOG.isDebugEnabled()) {
+                long duration = System.nanoTime() - startTime;
+                LOG.debug("Deterministic path getProperties requests: checked {} paths, found {} files, took {} ms",
+                        expandedPaths.size(), matchCnt, duration / 1000 / 1000);
+            }
+
+            return Status.OK;
+        } catch (Exception e) {
+            // Discard partial matches: the caller falls back to listing, which would otherwise
+            // add the same files to result a second time.
+            result.subList(sizeBeforeProbe, result.size()).clear();
+            LOG.warn("Failed to use getProperties requests, falling back to listing: {}", e.getMessage(), e);
+            return null;
+        }
+    }
+
     public Status listFiles(String remotePath, boolean recursive, List result) {
         try {
             remotePath = AzurePropertyUtils.validateAndNormalizeUri(remotePath);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java
index d9d0c7e94420da..ec827c785a52b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/obj/S3ObjStorage.java
@@ -18,6 +18,7 @@
 package org.apache.doris.fs.obj;
 
 import org.apache.doris.backup.Status;
+import org.apache.doris.common.Config;
 import org.apache.doris.common.DdlException;
 import org.apache.doris.common.UserException;
 import org.apache.doris.common.util.S3URI;
@@ -584,6 +585,28 @@ private GlobListResult globListInternal(String remotePath, List resu
             }
 
             bucket = uri.getBucket();
+
+            // Optimization: For deterministic paths (no wildcards like *, ?),
+            // use HEAD requests instead of listing to avoid requiring ListBucket permission.
+            // This is useful when only GetObject permission is granted.
+            // Controlled by config: s3_skip_list_for_deterministic_path
+            // Note: Skip when using path style because path-style parsing of virtual-host URLs
+            // can produce accidental HEAD successes where LIST would correctly fail.
+            // (e.g., http://bucket.endpoint/key with path_style=true: HEAD URL coincidentally
+            // matches the correct virtual-host URL, while LIST URL format is different and fails)
+            String keyPattern = uri.getKey();
+            if (Config.s3_skip_list_for_deterministic_path
+                    && !isUsePathStyle
+                    && S3Util.isDeterministicPattern(keyPattern)
+                    && !hasLimits && startFile == null) {
+                GlobListResult headResult = globListByHeadRequests(
+                        bucket, keyPattern, result, fileNameOnly, startTime);
+                if (headResult != null) {
+                    return headResult;
+                }
+                // If headResult is null, fall through to use listing
+            }
+
             String globPath = S3Util.extendGlobs(uri.getKey());
 
             if (LOG.isDebugEnabled()) {
@@ -718,6 +741,99 @@ private GlobListResult globListInternal(String remotePath, List resu
         }
     }
 
+    /**
+     * Get file metadata using HEAD requests for deterministic paths.
+     * This avoids requiring ListBucket permission when only GetObject permission is granted.
+     * If probing fails part-way, entries already appended to {@code result} are removed again
+     * before null is returned, so the listing fallback does not report them twice.
+     *
+     * @param bucket S3 bucket name
+     * @param keyPattern The key pattern (may contain {..} brace or [...] bracket patterns but no wildcards)
+     * @param result List to store matching RemoteFile objects
+     * @param fileNameOnly If true, only store file names; otherwise store full S3 paths
+     * @param startTime Start time for logging duration
+     * @return GlobListResult if successful, null if should fall back to listing
+     */
+    private GlobListResult globListByHeadRequests(String bucket, String keyPattern,
+            List<RemoteFile> result, boolean fileNameOnly, long startTime) {
+        // Remember where this call's entries begin so a mid-way failure can be rolled back.
+        int sizeBeforeProbe = result.size();
+        try {
+            // First expand [...] brackets to {...} braces, then expand {..} ranges, then expand braces
+            String expandedPattern = S3Util.expandBracketPatterns(keyPattern);
+            expandedPattern = S3Util.extendGlobs(expandedPattern);
+            List<String> expandedPaths = S3Util.expandBracePatterns(expandedPattern);
+
+            // Fall back to listing if too many paths to avoid overwhelming S3 with HEAD requests
+            // Controlled by config: s3_head_request_max_paths
+            if (expandedPaths.size() > Config.s3_head_request_max_paths) {
+                LOG.info("Expanded path count {} exceeds limit {}, falling back to LIST",
+                        expandedPaths.size(), Config.s3_head_request_max_paths);
+                return null;
+            }
+
+            if (LOG.isDebugEnabled()) {
+                LOG.debug("Using HEAD requests for deterministic path pattern, expanded to {} paths",
+                        expandedPaths.size());
+            }
+
+            long matchCnt = 0;
+            for (String key : expandedPaths) {
+                String fullPath = "s3://" + bucket + "/" + key;
+                try {
+                    HeadObjectResponse headResponse = getClient()
+                            .headObject(HeadObjectRequest.builder()
+                                    .bucket(bucket)
+                                    .key(key)
+                                    .build());
+
+                    matchCnt++;
+                    RemoteFile remoteFile = new RemoteFile(
+                            fileNameOnly ? Paths.get(key).getFileName().toString() : fullPath,
+                            true, // isFile
+                            headResponse.contentLength(),
+                            headResponse.contentLength(),
+                            // NOTE(review): epoch millis here, epoch seconds in AzureObjStorage - confirm the unit
+                            headResponse.lastModified() != null
+                                    ? headResponse.lastModified().toEpochMilli() : 0
+                    );
+                    result.add(remoteFile);
+
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("HEAD success for {}: size={}", fullPath, headResponse.contentLength());
+                    }
+                } catch (NoSuchKeyException e) {
+                    // File does not exist, skip it (this is expected for some expanded patterns)
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("File does not exist (skipped): {}", fullPath);
+                    }
+                } catch (S3Exception e) {
+                    if (e.statusCode() == HttpStatus.SC_NOT_FOUND) {
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("File does not exist (skipped): {}", fullPath);
+                        }
+                    } else {
+                        throw e;
+                    }
+                }
+            }
+
+            if (LOG.isDebugEnabled()) {
+                long duration = System.nanoTime() - startTime;
+                LOG.debug("Deterministic path HEAD requests: checked {} paths, found {} files, took {} ms",
+                        expandedPaths.size(), matchCnt, duration / 1000 / 1000);
+            }
+
+            return new GlobListResult(Status.OK, "", bucket, "");
+        } catch (Exception e) {
+            // Discard partial matches: the caller falls back to listing, which would otherwise
+            // add the same files to result a second time.
+            result.subList(sizeBeforeProbe, result.size()).clear();
+            LOG.warn("Failed to use HEAD requests, falling back to listing: {}", e.getMessage(), e);
+            return null;
+        }
+    }
+
     private static boolean reachLimit(int matchFileCnt, long matchFileSize, long sizeLimit, long fileNum) {
         if (matchFileCnt < 0 || sizeLimit < 0 || fileNum < 0) {
             return false;
diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java
index 23715440e8c082..4b976ed86cd3c5 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3UtilTest.java
@@ -20,6 +20,9 @@
 import org.junit.Assert;
 import org.junit.Test;
 
+import java.util.Arrays;
+import java.util.List;
+
 public class S3UtilTest {
 
     @Test
@@ -248,5 +251,210 @@ public void testExtendGlobs() {
         String result = S3Util.extendGlobs(input);
         Assert.assertEquals(expected, result);
     }
+
+    // Tests for isDeterministicPattern
+
+    @Test
+    public void testIsDeterministicPattern_simpleFile() {
+        // Simple file path without 
any patterns
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file.csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withBraces() {
+        // Brace lists and {a..b} ranges are deterministic: they can be expanded without listing
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file{1,2,3}.csv"));
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file{1..3}.csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withAsterisk() {
+        // An asterisk anywhere in the path (file name or directory) is a true wildcard
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/*.csv"));
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/*/file.csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withQuestionMark() {
+        // A question mark is a true wildcard (not deterministic)
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file?.csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withBrackets() {
+        // Non-negated bracket classes are deterministic (finite set, can be expanded)
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file[0-9].csv"));
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file[abc].csv"));
+        Assert.assertTrue(S3Util.isDeterministicPattern("path/to/file[a-zA-Z].csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withNegatedBrackets() {
+        // Negated bracket classes ([!...] and [^...]) are NOT deterministic
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[!abc].csv"));
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[^0-9].csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withMalformedBrackets() {
+        // Malformed brackets (no closing ]) are NOT deterministic
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[abc.csv"));
+        // Empty brackets [] are NOT deterministic
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file[].csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_withEscape() {
+        // Any backslash makes the pattern non-deterministic (escapes are never expanded)
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file\\*.csv"));
+    }
+
+    @Test
+    public void testIsDeterministicPattern_mixed() {
+        // A wildcard makes the whole path non-deterministic even when braces are present
+        Assert.assertFalse(S3Util.isDeterministicPattern("path/to/file{1,2}/*.csv"));
+    }
+
+    // Tests for expandBracePatterns
+
+    @Test
+    public void testExpandBracePatterns_noBraces() {
+        // No braces - the input comes back as the only path
+        List result = S3Util.expandBracePatterns("path/to/file.csv");
+        Assert.assertEquals(Arrays.asList("path/to/file.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_simpleBrace() {
+        // One brace group expands to one path per alternative, in order
+        List result = S3Util.expandBracePatterns("file{1,2,3}.csv");
+        Assert.assertEquals(Arrays.asList("file1.csv", "file2.csv", "file3.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_multipleBraces() {
+        // Several brace groups expand to their cross product, leftmost group varying slowest
+        List result = S3Util.expandBracePatterns("dir{a,b}/file{1,2}.csv");
+        Assert.assertEquals(Arrays.asList(
+                "dira/file1.csv", "dira/file2.csv",
+                "dirb/file1.csv", "dirb/file2.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_emptyBrace() {
+        // Empty brace content yields one empty alternative, so the braces are simply dropped
+        List result = S3Util.expandBracePatterns("file{}.csv");
+        Assert.assertEquals(Arrays.asList("file.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_singleValue() {
+        // A single alternative expands to exactly one path
+        List result = S3Util.expandBracePatterns("file{1}.csv");
+        Assert.assertEquals(Arrays.asList("file1.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_withPath() {
+        // Full path with braces: 2 years × 2 months = 4 paths
+        List result = S3Util.expandBracePatterns("data/year{2023,2024}/month{01,02}/file.csv");
+        Assert.assertEquals(4, result.size());
+        Assert.assertTrue(result.contains("data/year2023/month01/file.csv"));
+        Assert.assertTrue(result.contains("data/year2023/month02/file.csv"));
+        Assert.assertTrue(result.contains("data/year2024/month01/file.csv"));
+        Assert.assertTrue(result.contains("data/year2024/month02/file.csv"));
+    }
+
+    @Test
+    public void testExpandBracePatterns_extendedRange() {
+        // A {1..3} range is passed through extendGlobs() first, then brace-expanded
+        String expanded = S3Util.extendGlobs("file{1..3}.csv");
+        List result = S3Util.expandBracePatterns(expanded);
+        Assert.assertEquals(Arrays.asList("file1.csv", "file2.csv", "file3.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_malformedBrace() {
+        // Malformed brace pattern (no closing }) - the whole pattern is returned as a literal
+        List result = S3Util.expandBracePatterns("file{1,2.csv");
+        Assert.assertEquals(Arrays.asList("file{1,2.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracePatterns_malformedBraceWithDots() {
+        // Malformed range-like pattern (no closing }) - the whole pattern is returned as a literal
+        List result = S3Util.expandBracePatterns("file{1..csv");
+        Assert.assertEquals(Arrays.asList("file{1..csv"), result);
+    }
+
+    // Tests for expandBracketPatterns
+
+    @Test
+    public void testExpandBracketPatterns_noBrackets() {
+        // No brackets - returns unchanged
+        Assert.assertEquals("path/to/file.csv", S3Util.expandBracketPatterns("path/to/file.csv"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_simpleCharList() {
+        // [abc] => {a,b,c}
+        Assert.assertEquals("file{a,b,c}.csv", S3Util.expandBracketPatterns("file[abc].csv"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_charRange() {
+        // [0-3] => {0,1,2,3}
+        Assert.assertEquals("file{0,1,2,3}.csv", S3Util.expandBracketPatterns("file[0-3].csv"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_mixedRangeAndChars() {
+        // [a-cX] => {a,b,c,X}
+        Assert.assertEquals("file{a,b,c,X}.csv", S3Util.expandBracketPatterns("file[a-cX].csv"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_multipleRanges() {
+        // [a-c0-2] => {a,b,c,0,1,2}
+        Assert.assertEquals("file{a,b,c,0,1,2}.csv", S3Util.expandBracketPatterns("file[a-c0-2].csv"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_fullPipeline() {
+        // Full pipeline: bracket expansion -> extendGlobs -> brace expansion
+        // file[abc].csv => file{a,b,c}.csv => [filea.csv, fileb.csv, filec.csv]
+        String bracketExpanded = S3Util.expandBracketPatterns("file[abc].csv");
+        String globExpanded = S3Util.extendGlobs(bracketExpanded);
+        List result = S3Util.expandBracePatterns(globExpanded);
+        Assert.assertEquals(Arrays.asList("filea.csv", "fileb.csv", "filec.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracketPatterns_withBracesAndBrackets() {
+        // Mixed brackets and braces: dir[ab]/file{1,2}.csv
+        // => dir{a,b}/file{1,2}.csv => [dira/file1.csv, dira/file2.csv, dirb/file1.csv, dirb/file2.csv]
+        String bracketExpanded = S3Util.expandBracketPatterns("dir[ab]/file{1,2}.csv");
+        Assert.assertEquals("dir{a,b}/file{1,2}.csv", bracketExpanded);
+        List result = S3Util.expandBracePatterns(bracketExpanded);
+        Assert.assertEquals(Arrays.asList(
+                "dira/file1.csv", "dira/file2.csv",
+                "dirb/file1.csv", "dirb/file2.csv"), result);
+    }
+
+    @Test
+    public void testExpandBracketPatterns_digitRange() {
+        // [0-9] => {0,1,2,3,4,5,6,7,8,9}
+        String expanded = S3Util.expandBracketPatterns("part[0-9].dat");
+        List result = S3Util.expandBracePatterns(expanded);
+        Assert.assertEquals(10, result.size());
+        Assert.assertTrue(result.contains("part0.dat"));
+        Assert.assertTrue(result.contains("part9.dat"));
+    }
+
+    @Test
+    public void testExpandBracketPatterns_malformedBracket() {
+        // Malformed bracket (no closing ]) - [ kept as literal
+        Assert.assertEquals("file[abc.csv", S3Util.expandBracketPatterns("file[abc.csv"));
+    }
 }