Skip to content

Commit 281ce8b

Browse files
committed
refactor(build): use Babel AST for Unicode regex transformation
Replace regex-based string replacement with proper AST parsing for transforming Unicode property escapes. The previous approach could incorrectly match non-regex code (such as strings passed to .match() calls), breaking valid JavaScript.

Changes:
- Rewrite unicode-property-escape-transform.mjs to use @babel/parser, @babel/traverse, and magic-string for accurate regex literal detection
- Add @babel/parser, @babel/traverse, and magic-string to the catalog
- Add these dependencies to the build-infra package.json
- Update the bootstrap build script to write the npm output
- Fix CLI bin/cli.js to load from dist/index.js (the decompressor)

The AST-based approach identifies RegExpLiteral nodes precisely, then either rewrites each one to an ICU-free equivalent or replaces it with a no-op regex when it still relies on unsupported Unicode features.
1 parent ad4357f commit 281ce8b

File tree

8 files changed

+173
-93
lines changed

8 files changed

+173
-93
lines changed

packages/bootstrap/dist/bootstrap-npm.js

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/bootstrap/scripts/build.mjs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ try {
2929
console.log('→ Building npm bootstrap...')
3030
const npmResult = await build(npmConfig)
3131

32+
// Write the transformed output (build had write: false).
33+
if (npmResult.outputFiles && npmResult.outputFiles.length > 0) {
34+
for (const output of npmResult.outputFiles) {
35+
writeFileSync(output.path, output.contents)
36+
}
37+
}
38+
3239
console.log(`✓ ${npmConfig.outfile}`)
3340

3441
if (npmResult.metafile) {
packages/build-infra/lib/unicode-property-escape-transform.mjs

Lines changed: 108 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,114 +1,154 @@
11
/**
2-
* @fileoverview Shared Unicode property escape transformations for --with-intl=none.
2+
* @fileoverview Transform Unicode property escapes for --with-intl=none compatibility.
33
*
4-
* Transforms Unicode property escapes (\p{...}) into basic character class alternatives
5-
* that work without ICU support. This enables Node.js builds with --with-intl=none to
6-
* save ~6-8MB by removing ICU.
7-
*
8-
* Used by:
9-
* - babel-plugin-with-intl-none.mjs (CLI Babel transforms)
10-
* - bootstrap esbuild-plugin-smol-transform.mjs (Bootstrap esbuild transforms)
11-
*
12-
* @example
13-
* import { transformUnicodePropertyEscapes } from './unicode-property-escape-transform.mjs'
14-
*
15-
* const code = 'const regex = /[\\p{Letter}\\p{Number}]+/u'
16-
* const transformed = transformUnicodePropertyEscapes(code)
17-
* // Result: 'const regex = /[a-zA-Z0-9]+/'
4+
* This module provides transformations to convert Unicode property escapes
5+
* (\p{Property}) into basic character class equivalents that work without ICU support.
186
*/
197

8+
import { parse } from '@babel/parser'
9+
import traverseModule from '@babel/traverse'
10+
import MagicString from 'magic-string'
11+
12+
// Handle CommonJS default export.
13+
const traverse = traverseModule.default || traverseModule
14+
2015
/**
2116
* Map of Unicode property escapes to basic character class alternatives.
22-
* Approximations are used where exact equivalents don't exist.
23-
*
24-
* @type {Record<string, string>}
2517
*/
2618
export const unicodePropertyMap = {
2719
__proto__: null,
2820
// Letter categories.
29-
'Letter': 'a-zA-Z',
30-
'L': 'a-zA-Z',
31-
'Alpha': 'a-zA-Z',
3221
'Alphabetic': 'a-zA-Z',
22+
'Alpha': 'a-zA-Z',
23+
'L': 'a-zA-Z',
24+
'Letter': 'a-zA-Z',
3325
// Number categories.
34-
'Number': '0-9',
35-
'N': '0-9',
3626
'Digit': '0-9',
27+
'N': '0-9',
3728
'Nd': '0-9',
29+
'Number': '0-9',
3830
// Whitespace.
3931
'Space': '\\s',
4032
'White_Space': '\\s',
4133
// ASCII range.
4234
'ASCII': '\\x00-\\x7F',
4335
// Control characters (basic approximation).
44-
'Control': '\\x00-\\x1F\\x7F-\\x9F',
4536
'Cc': '\\x00-\\x1F\\x7F-\\x9F',
37+
'Control': '\\x00-\\x1F\\x7F-\\x9F',
4638
// Format characters (approximate with zero-width space).
47-
'Format': '\\u200B-\\u200D\\uFEFF',
4839
'Cf': '\\u200B-\\u200D\\uFEFF',
40+
'Format': '\\u200B-\\u200D\\uFEFF',
4941
// Mark categories (combining marks - approximate).
50-
'Mark': '\\u0300-\\u036F',
5142
'M': '\\u0300-\\u036F',
43+
'Mark': '\\u0300-\\u036F',
5244
// Default_Ignorable_Code_Point (approximate with common invisibles).
53-
// Covers most common cases: soft hyphen, zero-width spaces, format controls, etc.
5445
'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
5546
}
5647

5748
/**
58-
* Transform Unicode property escapes in regex patterns for ICU-free environments.
59-
*
60-
* @param {string} content - Source code to transform
61-
* @returns {string} Transformed source code
49+
* Check if a regex pattern has unsupported Unicode features.
6250
*/
63-
export function transformUnicodePropertyEscapes(content) {
64-
let transformed = content
65-
66-
// Transform \p{Property} inside character classes [...].
67-
// Example: /[\p{Letter}\p{Number}]+/u → /[a-zA-Z0-9]+/
68-
transformed = transformed.replace(
69-
/\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g,
70-
(_match, charClass) => {
71-
let newCharClass = charClass
72-
73-
// Replace each \p{Property} with its character class equivalent.
74-
for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
75-
const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
76-
newCharClass = newCharClass.replace(
77-
new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
78-
replacement,
79-
)
80-
}
51+
function hasUnsupportedUnicodeFeatures(pattern) {
52+
// Check for \u{} escapes (require /u flag).
53+
if (/\\u\{[0-9a-fA-F]+\}/.test(pattern)) {
54+
return true
55+
}
56+
// Check for remaining \p{} or \P{} escapes that we don't support.
57+
if (/\\[pP]\{/.test(pattern)) {
58+
return true
59+
}
60+
return false
61+
}
8162

82-
return `[${newCharClass}]`
83-
},
84-
)
63+
/**
64+
* Transform a regex pattern by replacing \p{Property} with character classes.
65+
*/
66+
function transformRegexPattern(pattern) {
67+
let transformed = pattern
8568

86-
// Transform standalone \p{Property} (not inside character class).
87-
// Example: /\p{Letter}+/u → /[a-zA-Z]+/
69+
// Replace \p{Property} with character class equivalents.
8870
for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
8971
const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
90-
// Match \p{Property} that's NOT inside square brackets.
91-
// This is a simplified approach - proper parsing would be better.
72+
// Replace \p{Property} with [replacement].
9273
transformed = transformed.replace(
9374
new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
9475
`[${replacement}]`,
9576
)
9677
}
9778

98-
// Remove /u and /v flags from regexes that used Unicode property escapes.
99-
// This is safe because we've replaced them with basic character classes.
100-
// Match regex literals: /pattern/flags
101-
transformed = transformed.replace(
102-
/\/([^/\\]|\\.)+\/([gimsuvy]+)/g,
103-
(match, _pattern, flags) => {
104-
// Only remove u/v flags if the regex originally had Unicode escapes.
105-
if (flags.includes('u') || flags.includes('v')) {
79+
return transformed
80+
}
81+
82+
/**
83+
* Transform Unicode property escapes in regex patterns for ICU-free environments.
84+
*
85+
* Uses Babel AST parsing to properly identify regex literals and transform them.
86+
*
87+
* @param {string} content - Source code to transform
88+
* @returns {string} Transformed source code
89+
*/
90+
export function transformUnicodePropertyEscapes(content) {
91+
let ast
92+
try {
93+
ast = parse(content, {
94+
sourceType: 'module',
95+
plugins: [],
96+
})
97+
} catch (e) {
98+
// If parsing fails, return content unchanged.
99+
console.warn('Failed to parse code for Unicode transform:', e.message)
100+
return content
101+
}
102+
103+
const s = new MagicString(content)
104+
105+
traverse(ast, {
106+
RegExpLiteral(path) {
107+
const { node } = path
108+
const { pattern, flags } = node
109+
const { start, end } = node
110+
111+
// Check if this regex has /u or /v flags.
112+
const hasUFlag = flags.includes('u')
113+
const hasVFlag = flags.includes('v')
114+
115+
if (!hasUFlag && !hasVFlag) {
116+
// No Unicode flags, nothing to transform.
117+
return
118+
}
119+
120+
// Transform the pattern.
121+
const transformedPattern = transformRegexPattern(pattern)
122+
123+
// Check if transformed pattern still has unsupported Unicode features.
124+
if (hasUnsupportedUnicodeFeatures(transformedPattern)) {
125+
// Replace entire regex with /(?:)/ (no-op regex).
126+
s.overwrite(start, end, '/(?:)/')
127+
return
128+
}
129+
130+
// If pattern changed, update it and remove Unicode flags.
131+
if (transformedPattern !== pattern) {
132+
// Remove /u and /v flags.
106133
const newFlags = flags.replace(/[uv]/g, '')
107-
return match.slice(0, -flags.length) + newFlags
134+
const newRegex = `/${transformedPattern}/${newFlags}`
135+
s.overwrite(start, end, newRegex)
136+
return
137+
}
138+
139+
// Pattern unchanged but has Unicode flags - check if safe to remove flags.
140+
// Only remove flags if pattern has no \u{} escapes or other Unicode-specific syntax.
141+
if (!hasUnsupportedUnicodeFeatures(pattern)) {
142+
// Safe to remove Unicode flags.
143+
const newFlags = flags.replace(/[uv]/g, '')
144+
const newRegex = `/${pattern}/${newFlags}`
145+
s.overwrite(start, end, newRegex)
146+
} else {
147+
// Has unsupported features, replace with no-op.
148+
s.overwrite(start, end, '/(?:)/')
108149
}
109-
return match
110150
},
111-
)
151+
})
112152

113-
return transformed
153+
return s.toString()
114154
}

packages/build-infra/package.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
"./lib/unicode-property-escape-transform": "./lib/unicode-property-escape-transform.mjs"
2323
},
2424
"dependencies": {
25-
"@socketsecurity/lib": "catalog:"
25+
"@babel/parser": "catalog:",
26+
"@babel/traverse": "catalog:",
27+
"@socketsecurity/lib": "catalog:",
28+
"magic-string": "catalog:"
2629
}
2730
}

packages/cli/.config/esbuild.cli.build.mjs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import { build } from 'esbuild'
88
import { execSync } from 'node:child_process'
99
import { randomUUID } from 'node:crypto'
10-
import { existsSync, readFileSync } from 'node:fs'
10+
import { existsSync, readFileSync, writeFileSync } from 'node:fs'
1111
import path from 'node:path'
1212
import { fileURLToPath } from 'node:url'
1313

@@ -389,10 +389,19 @@ const config = {
389389
// Run build if invoked directly.
390390
// Use fileURLToPath to handle Windows paths correctly.
391391
if (fileURLToPath(import.meta.url) === process.argv[1]) {
392-
build(config).catch(error => {
393-
console.error('Build failed:', error)
394-
process.exitCode = 1
395-
})
392+
build(config)
393+
.then(result => {
394+
// Write the transformed output (build had write: false).
395+
if (result.outputFiles && result.outputFiles.length > 0) {
396+
for (const output of result.outputFiles) {
397+
writeFileSync(output.path, output.contents)
398+
}
399+
}
400+
})
401+
.catch(error => {
402+
console.error('Build failed:', error)
403+
process.exitCode = 1
404+
})
396405
}
397406

398407
export default config

packages/cli/bin/cli.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ void (async () => {
88
const rootPath = path.join(__dirname, '..')
99
Module.enableCompileCache?.(path.join(rootPath, '.cache'))
1010

11-
// Execute the CLI bundle.
12-
require(path.join(rootPath, 'dist/cli.js'))
11+
// Execute the CLI bundle (decompresses cli.js.bz).
12+
require(path.join(rootPath, 'dist/index.js'))
1313
})()

pnpm-lock.yaml

Lines changed: 23 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pnpm-workspace.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@ packages:
33

44
catalog:
55
# Socket shared dependencies - define versions once, reference with catalog: protocol.
6+
'@babel/parser': 7.28.4
7+
'@babel/traverse': 7.28.4
68
'@socketsecurity/config': 3.0.1
79
'@socketsecurity/lib': 2.7.0
810
'@socketsecurity/registry': 2.0.0
911
'@socketsecurity/sdk': 3.0.14
1012
del-cli: 6.0.0
1113
esbuild: 0.24.0
14+
magic-string: 0.30.19
1215
postject: 1.0.0-alpha.6
1316
semver: 7.6.3

0 commit comments

Comments
 (0)