refactor(build): use Babel AST for Unicode regex transformation

jdalton · jdalton · commit 281ce8b65ccd · 2025-10-29T11:50:07.000-04:00
Replace regex-based string replacement with proper AST parsing for
transforming Unicode property escapes. The previous approach could
incorrectly match non-regex code (like strings in .match() calls),
breaking valid JavaScript.

Changes:
- Rewrite unicode-property-escape-transform.mjs to use @babel/parser,
  @babel/traverse, and magic-string for accurate regex literal detection
- Add @babel/parser, @babel/traverse, and magic-string to catalog
- Add dependencies to build-infra package.json
- Update bootstrap and CLI build scripts to write build output files to
  disk (the builds run with write: false)
- Fix CLI bin/cli.js to load from dist/index.js (decompressor)

The AST-based approach properly identifies RegExpLiteral nodes and
safely transforms or replaces them based on Unicode feature support.
diff --git a/packages/bootstrap/dist/bootstrap-npm.js b/packages/bootstrap/dist/bootstrap-npm.js
diff --git a/packages/bootstrap/scripts/build.mjs b/packages/bootstrap/scripts/build.mjs
@@ -29,6 +29,13 @@ try {
   console.log('→ Building npm bootstrap...')
   const npmResult = await build(npmConfig)
 
+  // Write the transformed output (build had write: false).
+  if (npmResult.outputFiles && npmResult.outputFiles.length > 0) {
+    for (const output of npmResult.outputFiles) {
+      writeFileSync(output.path, output.contents)
+    }
+  }
+
   console.log(`✓ ${npmConfig.outfile}`)
 
   if (npmResult.metafile) {
diff --git a/packages/build-infra/lib/unicode-property-escape-transform.mjs b/packages/build-infra/lib/unicode-property-escape-transform.mjs
@@ -1,114 +1,154 @@
 /**
- * @fileoverview Shared Unicode property escape transformations for --with-intl=none.
+ * @fileoverview Transform Unicode property escapes for --with-intl=none compatibility.
  *
- * Transforms Unicode property escapes (\p{...}) into basic character class alternatives
- * that work without ICU support. This enables Node.js builds with --with-intl=none to
- * save ~6-8MB by removing ICU.
- *
- * Used by:
- * - babel-plugin-with-intl-none.mjs (CLI Babel transforms)
- * - bootstrap esbuild-plugin-smol-transform.mjs (Bootstrap esbuild transforms)
- *
- * @example
- * import { transformUnicodePropertyEscapes } from './unicode-property-escape-transform.mjs'
- *
- * const code = 'const regex = /[\\p{Letter}\\p{Number}]+/u'
- * const transformed = transformUnicodePropertyEscapes(code)
- * // Result: 'const regex = /[a-zA-Z0-9]+/'
+ * This module provides transformations to convert Unicode property escapes
+ * (\p{Property}) into basic character class equivalents that work without ICU support.
  */
 
+import { parse } from '@babel/parser'
+import traverseModule from '@babel/traverse'
+import MagicString from 'magic-string'
+
+// Handle CommonJS default export.
+const traverse = traverseModule.default || traverseModule
+
 /**
  * Map of Unicode property escapes to basic character class alternatives.
- * Approximations are used where exact equivalents don't exist.
- *
- * @type {Record<string, string>}
  */
 export const unicodePropertyMap = {
   __proto__: null,
   // Letter categories.
-  'Letter': 'a-zA-Z',
-  'L': 'a-zA-Z',
-  'Alpha': 'a-zA-Z',
   'Alphabetic': 'a-zA-Z',
+  'Alpha': 'a-zA-Z',
+  'L': 'a-zA-Z',
+  'Letter': 'a-zA-Z',
   // Number categories.
-  'Number': '0-9',
-  'N': '0-9',
   'Digit': '0-9',
+  'N': '0-9',
   'Nd': '0-9',
+  'Number': '0-9',
   // Whitespace.
   'Space': '\\s',
   'White_Space': '\\s',
   // ASCII range.
   'ASCII': '\\x00-\\x7F',
   // Control characters (basic approximation).
-  'Control': '\\x00-\\x1F\\x7F-\\x9F',
   'Cc': '\\x00-\\x1F\\x7F-\\x9F',
+  'Control': '\\x00-\\x1F\\x7F-\\x9F',
   // Format characters (approximate with zero-width space).
-  'Format': '\\u200B-\\u200D\\uFEFF',
   'Cf': '\\u200B-\\u200D\\uFEFF',
+  'Format': '\\u200B-\\u200D\\uFEFF',
   // Mark categories (combining marks - approximate).
-  'Mark': '\\u0300-\\u036F',
   'M': '\\u0300-\\u036F',
+  'Mark': '\\u0300-\\u036F',
   // Default_Ignorable_Code_Point (approximate with common invisibles).
-  // Covers most common cases: soft hyphen, zero-width spaces, format controls, etc.
   'Default_Ignorable_Code_Point': '\\u00AD\\u034F\\u061C\\u115F-\\u1160\\u17B4-\\u17B5\\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u3164\\uFE00-\\uFE0F\\uFEFF\\uFFA0\\uFFF0-\\uFFF8',
 }
 
 /**
- * Transform Unicode property escapes in regex patterns for ICU-free environments.
- *
- * @param {string} content - Source code to transform
- * @returns {string} Transformed source code
+ * Check if a regex pattern has unsupported Unicode features.
  */
-export function transformUnicodePropertyEscapes(content) {
-  let transformed = content
-
-  // Transform \p{Property} inside character classes [...].
-  // Example: /[\p{Letter}\p{Number}]+/u → /[a-zA-Z0-9]+/
-  transformed = transformed.replace(
-    /\[([^\]]*\\p\{[^}]+\}[^\]]*)\]/g,
-    (_match, charClass) => {
-      let newCharClass = charClass
-
-      // Replace each \p{Property} with its character class equivalent.
-      for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
-        const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
-        newCharClass = newCharClass.replace(
-          new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
-          replacement,
-        )
-      }
+function hasUnsupportedUnicodeFeatures(pattern) {
+  // Check for \u{} escapes (require /u flag).
+  if (/\\u\{[0-9a-fA-F]+\}/.test(pattern)) {
+    return true
+  }
+  // Check for remaining \p{} or \P{} escapes that we don't support.
+  if (/\\[pP]\{/.test(pattern)) {
+    return true
+  }
+  return false
+}
 
-      return `[${newCharClass}]`
-    },
-  )
+/**
+ * Transform a regex pattern by replacing \p{Property} with character classes.
+ */
+function transformRegexPattern(pattern) {
+  let transformed = pattern
 
-  // Transform standalone \p{Property} (not inside character class).
-  // Example: /\p{Letter}+/u → /[a-zA-Z]+/
+  // Replace \p{Property} with character class equivalents.
   for (const [prop, replacement] of Object.entries(unicodePropertyMap)) {
     const escapedProp = prop.replace(/[\\{}]/g, '\\$&')
-    // Match \p{Property} that's NOT inside square brackets.
-    // This is a simplified approach - proper parsing would be better.
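+    // Replace \p{Property} with [replacement]. NOTE(review): the brackets are added even when the escape already sits inside a character class, so [\p{L}\p{N}] becomes [[a-zA-Z][0-9]] - confirm this is intended.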
     transformed = transformed.replace(
       new RegExp(`\\\\p\\{${escapedProp}\\}`, 'g'),
       `[${replacement}]`,
     )
   }
 
-  // Remove /u and /v flags from regexes that used Unicode property escapes.
-  // This is safe because we've replaced them with basic character classes.
-  // Match regex literals: /pattern/flags
-  transformed = transformed.replace(
-    /\/([^/\\]|\\.)+\/([gimsuvy]+)/g,
-    (match, _pattern, flags) => {
-      // Only remove u/v flags if the regex originally had Unicode escapes.
-      if (flags.includes('u') || flags.includes('v')) {
+  return transformed
+}
+
+/**
+ * Transform Unicode property escapes in regex patterns for ICU-free environments.
+ *
+ * Uses Babel AST parsing to properly identify regex literals and transform them.
+ *
+ * @param {string} content - Source code to transform
+ * @returns {string} Transformed source code
+ */
+export function transformUnicodePropertyEscapes(content) {
+  let ast
+  try {
+    ast = parse(content, {
+      sourceType: 'module',
+      plugins: [],
+    })
+  } catch (e) {
+    // If parsing fails, return content unchanged.
+    console.warn('Failed to parse code for Unicode transform:', e.message)
+    return content
+  }
+
+  const s = new MagicString(content)
+
+  traverse(ast, {
+    RegExpLiteral(path) {
+      const { node } = path
+      const { pattern, flags } = node
+      const { start, end } = node
+
+      // Check if this regex has /u or /v flags.
+      const hasUFlag = flags.includes('u')
+      const hasVFlag = flags.includes('v')
+
+      if (!hasUFlag && !hasVFlag) {
+        // No Unicode flags, nothing to transform.
+        return
+      }
+
+      // Transform the pattern.
+      const transformedPattern = transformRegexPattern(pattern)
+
+      // Check if transformed pattern still has unsupported Unicode features.
+      if (hasUnsupportedUnicodeFeatures(transformedPattern)) {
+        // Replace entire regex (pattern and flags) with /(?:)/, an empty pattern that matches the empty string at any position.
+        s.overwrite(start, end, '/(?:)/')
+        return
+      }
+
+      // If pattern changed, update it and remove Unicode flags.
+      if (transformedPattern !== pattern) {
+        // Remove /u and /v flags.
         const newFlags = flags.replace(/[uv]/g, '')
-        return match.slice(0, -flags.length) + newFlags
+        const newRegex = `/${transformedPattern}/${newFlags}`
+        s.overwrite(start, end, newRegex)
+        return
+      }
+
+      // Pattern unchanged but has Unicode flags. This same pattern already passed the
+      // unsupported-feature check above, so the flags are removed; the else branch is defensive only.
+      if (!hasUnsupportedUnicodeFeatures(pattern)) {
+        // Safe to remove Unicode flags.
+        const newFlags = flags.replace(/[uv]/g, '')
+        const newRegex = `/${pattern}/${newFlags}`
+        s.overwrite(start, end, newRegex)
+      } else {
+        // Has unsupported features, replace with no-op.
+        s.overwrite(start, end, '/(?:)/')
       }
-      return match
     },
-  )
+  })
 
-  return transformed
+  return s.toString()
 }
diff --git a/packages/build-infra/package.json b/packages/build-infra/package.json
@@ -22,6 +22,9 @@
     "./lib/unicode-property-escape-transform": "./lib/unicode-property-escape-transform.mjs"
   },
   "dependencies": {
-    "@socketsecurity/lib": "catalog:"
+    "@babel/parser": "catalog:",
+    "@babel/traverse": "catalog:",
+    "@socketsecurity/lib": "catalog:",
+    "magic-string": "catalog:"
   }
 }
diff --git a/packages/cli/.config/esbuild.cli.build.mjs b/packages/cli/.config/esbuild.cli.build.mjs
@@ -7,7 +7,7 @@
 import { build } from 'esbuild'
 import { execSync } from 'node:child_process'
 import { randomUUID } from 'node:crypto'
-import { existsSync, readFileSync } from 'node:fs'
+import { existsSync, readFileSync, writeFileSync } from 'node:fs'
 import path from 'node:path'
 import { fileURLToPath } from 'node:url'
 
@@ -389,10 +389,19 @@ const config = {
 // Run build if invoked directly.
 // Use fileURLToPath to handle Windows paths correctly.
 if (fileURLToPath(import.meta.url) === process.argv[1]) {
-  build(config).catch(error => {
-    console.error('Build failed:', error)
-    process.exitCode = 1
-  })
+  build(config)
+    .then(result => {
+      // Write the transformed output (build had write: false).
+      if (result.outputFiles && result.outputFiles.length > 0) {
+        for (const output of result.outputFiles) {
+          writeFileSync(output.path, output.contents)
+        }
+      }
+    })
+    .catch(error => {
+      console.error('Build failed:', error)
+      process.exitCode = 1
+    })
 }
 
 export default config
diff --git a/packages/cli/bin/cli.js b/packages/cli/bin/cli.js
@@ -8,6 +8,6 @@ void (async () => {
   const rootPath = path.join(__dirname, '..')
   Module.enableCompileCache?.(path.join(rootPath, '.cache'))
 
-  // Execute the CLI bundle.
-  require(path.join(rootPath, 'dist/cli.js'))
+  // Execute the CLI bundle (decompresses cli.js.bz).
+  require(path.join(rootPath, 'dist/index.js'))
 })()
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
@@ -3,11 +3,14 @@ packages:
 
 catalog:
   # Socket shared dependencies - define versions once, reference with catalog: protocol.
+  '@babel/parser': 7.28.4
+  '@babel/traverse': 7.28.4
   '@socketsecurity/config': 3.0.1
   '@socketsecurity/lib': 2.7.0
   '@socketsecurity/registry': 2.0.0
   '@socketsecurity/sdk': 3.0.14
   del-cli: 6.0.0
   esbuild: 0.24.0
+  magic-string: 0.30.19
   postject: 1.0.0-alpha.6
   semver: 7.6.3

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,9 @@`
`22`	`22`	`"./lib/unicode-property-escape-transform": "./lib/unicode-property-escape-transform.mjs"`
`23`	`23`	`},`
`24`	`24`	`"dependencies": {`
`25`		`- "@socketsecurity/lib": "catalog:"`
	`25`	`+ "@babel/parser": "catalog:",`
	`26`	`+ "@babel/traverse": "catalog:",`
	`27`	`+ "@socketsecurity/lib": "catalog:",`
	`28`	`+ "magic-string": "catalog:"`
`26`	`29`	`}`
`27`	`30`	`}`