|
1 | 1 | /** |
2 | | - * @fileoverview Shared Unicode property escape transformations for --with-intl=none. |
| 2 | + * @fileoverview Transform Unicode property escapes for --with-intl=none compatibility. |
3 | 3 | * |
4 | | - * Transforms Unicode property escapes (\p{...}) into basic character class alternatives |
5 | | - * that work without ICU support. This enables Node.js builds with --with-intl=none to |
6 | | - * save ~6-8MB by removing ICU. |
7 | | - * |
8 | | - * Used by: |
9 | | - * - babel-plugin-with-intl-none.mjs (CLI Babel transforms) |
10 | | - * - bootstrap esbuild-plugin-smol-transform.mjs (Bootstrap esbuild transforms) |
11 | | - * |
12 | | - * @example |
13 | | - * import { transformUnicodePropertyEscapes } from './unicode-property-escape-transform.mjs' |
14 | | - * |
15 | | - * const code = 'const regex = /[\\p{Letter}\\p{Number}]+/u' |
16 | | - * const transformed = transformUnicodePropertyEscapes(code) |
17 | | - * // Result: 'const regex = /[a-zA-Z0-9]+/' |
| 4 | + * This module provides transformations to convert Unicode property escapes |
| 5 | + * (\p{Property}) into basic character class equivalents that work without ICU support. |
18 | 6 | */ |
19 | 7 |
|
| 8 | +import { parse } from '@babel/parser' |
| 9 | +import traverseModule from '@babel/traverse' |
| 10 | +import MagicString from 'magic-string' |
| 11 | + |
| 12 | +// Handle CommonJS default export. |
| 13 | +const traverse = traverseModule.default || traverseModule |
| 14 | + |
// Replacement ranges shared by every alias of the same property, so the
// aliases cannot drift apart.
const ASCII_LETTER_RANGE = 'a-zA-Z'
const ASCII_DIGIT_RANGE = '0-9'
const WHITESPACE_CLASS = '\\s'
const ASCII_RANGE = '\\x00-\\x7F'
const CONTROL_RANGE = '\\x00-\\x1F\\x7F-\\x9F'
const FORMAT_RANGE = '\\u200B-\\u200D\\uFEFF'
const MARK_RANGE = '\\u0300-\\u036F'
const DEFAULT_IGNORABLE_RANGE = [
  '\\u00AD',
  '\\u034F',
  '\\u061C',
  '\\u115F-\\u1160',
  '\\u17B4-\\u17B5',
  '\\u180B-\\u180D',
  '\\u200B-\\u200F',
  '\\u202A-\\u202E',
  '\\u2060-\\u206F',
  '\\u3164',
  '\\uFE00-\\uFE0F',
  '\\uFEFF',
  '\\uFFA0',
  '\\uFFF0-\\uFFF8',
].join('')

/**
 * Lookup table from Unicode property names (the `Name` in `\p{Name}`) to the
 * body of a basic character class that stands in for that property.
 *
 * Values are regex source fragments WITHOUT surrounding brackets. Several of
 * them are approximations rather than exact equivalents. The object has a
 * null prototype, so only the keys listed here ever resolve.
 */
export const unicodePropertyMap = {
  __proto__: null,
  // Letters (ASCII only).
  Alphabetic: ASCII_LETTER_RANGE,
  Alpha: ASCII_LETTER_RANGE,
  L: ASCII_LETTER_RANGE,
  Letter: ASCII_LETTER_RANGE,
  // Numbers (ASCII digits only).
  Digit: ASCII_DIGIT_RANGE,
  N: ASCII_DIGIT_RANGE,
  Nd: ASCII_DIGIT_RANGE,
  Number: ASCII_DIGIT_RANGE,
  // Whitespace.
  Space: WHITESPACE_CLASS,
  White_Space: WHITESPACE_CLASS,
  // ASCII range.
  ASCII: ASCII_RANGE,
  // Control characters (basic approximation).
  Cc: CONTROL_RANGE,
  Control: CONTROL_RANGE,
  // Format characters (approximated with zero-width characters).
  Cf: FORMAT_RANGE,
  Format: FORMAT_RANGE,
  // Combining marks (approximate).
  M: MARK_RANGE,
  Mark: MARK_RANGE,
  // Default_Ignorable_Code_Point (approximated with common invisibles).
  Default_Ignorable_Code_Point: DEFAULT_IGNORABLE_RANGE,
}
56 | 47 |
|
/**
 * Check whether a regex pattern still contains escapes that only work with
 * the /u or /v flag and therefore cannot survive having that flag removed.
 *
 * Detected escapes:
 * - `\u{HEX}` code point escapes.
 * - `\p{...}` / `\P{...}` property escapes.
 *
 * The pattern is scanned escape by escape so that an escaped backslash
 * followed by ordinary text (for example `\\u{2}`, a literal backslash and a
 * quantified `u`) is not mistaken for a Unicode-only escape.
 *
 * @param {string} pattern - Regex source, without delimiters or flags
 * @returns {boolean} True when a Unicode-only escape is present
 */
function hasUnsupportedUnicodeFeatures(pattern) {
  // Sticky, so it only matches exactly at the position set via lastIndex.
  const codePointBody = /\{[0-9a-fA-F]+\}/y

  for (let i = 0; i < pattern.length; i += 1) {
    if (pattern[i] !== '\\') {
      continue
    }

    const escaped = pattern[i + 1]
    if ((escaped === 'p' || escaped === 'P') && pattern[i + 2] === '{') {
      return true
    }
    if (escaped === 'u') {
      codePointBody.lastIndex = i + 2
      if (codePointBody.test(pattern)) {
        return true
      }
    }

    // Skip the escaped character so `\\` never starts a second escape.
    i += 1
  }

  return false
}
/**
 * Rewrite every known `\p{Property}` escape in a regex pattern as a basic
 * character class taken from `unicodePropertyMap`.
 *
 * - Outside a character class the replacement is wrapped in brackets:
 *   `\p{Letter}+` becomes `[a-zA-Z]+`.
 * - Inside a character class the bare ranges are spliced in, because a nested
 *   `[...]` is not a valid way to combine classes once /u or /v is removed:
 *   `[\p{Letter}\p{Number}]` becomes `[a-zA-Z0-9]`.
 *
 * Escapes are consumed pairwise, so `p{...}` after an escaped backslash is
 * never rewritten. Negated escapes (`\P{...}`) and properties missing from the
 * map are left untouched so the caller can detect them as unsupported.
 *
 * @param {string} pattern - Regex source, without delimiters or flags
 * @returns {string} Pattern with known property escapes replaced
 */
function transformRegexPattern(pattern) {
  let result = ''
  let inClass = false
  let i = 0

  while (i < pattern.length) {
    const ch = pattern[i]

    if (ch === '\\') {
      if (pattern[i + 1] === 'p' && pattern[i + 2] === '{') {
        const close = pattern.indexOf('}', i + 3)
        const replacement =
          close === -1 ? undefined : unicodePropertyMap[pattern.slice(i + 3, close)]
        if (typeof replacement === 'string') {
          result += inClass ? replacement : `[${replacement}]`
          i = close + 1
          continue
        }
      }
      // Any other escape: copy the backslash together with the escaped char.
      result += pattern.slice(i, i + 2)
      i += 2
      continue
    }

    if (ch === '[' && !inClass) {
      inClass = true
    } else if (ch === ']' && inClass) {
      inClass = false
    }
    result += ch
    i += 1
  }

  return result
}
| 81 | + |
/**
 * Report whether a /v-mode pattern uses class set notation inside a character
 * class: a nested class, the `--` / `&&` set operators, or a `\q{...}` string
 * literal. That syntax is reinterpreted once the /v flag is removed.
 *
 * @param {string} pattern - Regex source of a literal that carries the /v flag
 * @returns {boolean} True when class set notation is present
 */
function usesClassSetSyntax(pattern) {
  let inClass = false

  for (let i = 0; i < pattern.length; i += 1) {
    const ch = pattern[i]

    if (ch === '\\') {
      if (inClass && pattern[i + 1] === 'q' && pattern[i + 2] === '{') {
        return true
      }
      // Skip the escaped character.
      i += 1
      continue
    }

    if (!inClass) {
      inClass = ch === '['
      continue
    }
    if (ch === ']') {
      inClass = false
      continue
    }
    if (ch === '[') {
      return true
    }
    if ((ch === '-' || ch === '&') && pattern[i + 1] === ch) {
      return true
    }
  }

  return false
}

/**
 * Transform Unicode property escapes in regex patterns for ICU-free environments.
 *
 * The source is parsed with Babel and every regex literal that carries the /u
 * or /v flag is rewritten in place:
 * - Known `\p{Property}` escapes are replaced with basic character classes and
 *   the /u and /v flags are removed.
 * - A literal that still needs Unicode mode afterwards (remaining `\p{}` /
 *   `\P{}` / `\u{}` escapes, or /v class set notation) is replaced with the
 *   no-op pattern `(?:)`.
 *
 * In both cases every other flag is kept, so calls that require a flag (for
 * example `matchAll` and `replaceAll` with `g`) keep working. Regex literals
 * without /u or /v are left untouched. Only regex literals are visited.
 *
 * If the source cannot be parsed, a warning is logged and the content is
 * returned unchanged.
 *
 * @param {string} content - Source code to transform
 * @returns {string} Transformed source code
 */
export function transformUnicodePropertyEscapes(content) {
  let ast
  try {
    ast = parse(content, {
      sourceType: 'module',
      plugins: [],
    })
  } catch (e) {
    // Best effort: leave code we cannot parse exactly as it was.
    console.warn('Failed to parse code for Unicode transform:', e.message)
    return content
  }

  const s = new MagicString(content)

  traverse(ast, {
    RegExpLiteral({ node }) {
      const { pattern, flags, start, end } = node

      // Literals without /u or /v cannot contain Unicode-only syntax.
      if (!/[uv]/.test(flags)) {
        return
      }

      const newFlags = flags.replace(/[uv]/g, '')
      const transformedPattern = transformRegexPattern(pattern)

      // Anything that still depends on Unicode mode cannot be expressed
      // without the flag, so the literal degrades to a match-anything no-op.
      const needsUnicodeMode =
        hasUnsupportedUnicodeFeatures(transformedPattern) ||
        (flags.includes('v') && usesClassSetSyntax(pattern))
      const newPattern = needsUnicodeMode ? '(?:)' : transformedPattern

      s.overwrite(start, end, `/${newPattern}/${newFlags}`)
    },
  })

  return s.toString()
}
0 commit comments