From 7a5aa80b3f63ea14316e1bed455aa2b73961a527 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 17:59:33 -0300 Subject: [PATCH 1/6] Add Unicode runtime prototype script --- README.md | 24 +++ prototype_ucd_all.py | 463 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 487 insertions(+) create mode 100644 prototype_ucd_all.py diff --git a/README.md b/README.md index a36e085..a3cfc84 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,30 @@ on the `compression` parameter. pytest ``` +## Unicode Prototype + +To prototype an "all properties except Unihan" UCD build against Unicode 17.0.0, +download the non-Unihan XML plus the text UCD bundle and analyze every +per-codepoint property with: + +```bash +python prototype_ucd_all.py --download \ + --json-out data/ucd/17.0.0/prototype-summary.json +``` + +The prototype uses `ucd.nounihan.flat.zip` as the primary source and +supplements the small set of non-Unihan properties not present in the XML +(`FC_NFKC`, `Gr_Link`, `Hyphen`, `Name_Alias`, `XO_*`, and empty `isc`) +from the text UCD files. It also applies a couple of property-specific +prototype transforms: `bmg` is packed as `bmg(u) - u`, and Hangul syllables +are elided from `dm` so they can be handled algorithmically. + +By default it uses a runtime-oriented profile that excludes names and the +specialist `kEH_*`, `kTGT_*`, and `kNSHU_*` property families. Use +`--profile full` to analyze the full non-Unihan property surface instead. +Pass `--c-out path/to/ucd_runtime.c` to also emit a single C source file with +one accessor per selected property. + ## History I first wrote something like this back in 2001 when I needed it in FriBidi: diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py new file mode 100644 index 0000000..9675bae --- /dev/null +++ b/prototype_ucd_all.py @@ -0,0 +1,463 @@ +#!/usr/bin/env python3 +"""Prototype a compact all-properties Unicode Character Database build. + +This script downloads and analyzes the non-Unihan Unicode Character Database +and runs packTab over every per-codepoint property it can account for. + +The prototype uses the UCD XML as the primary source, then supplements the +small set of non-Unihan properties not carried in the XML with data from the +text UCD bundle. +""" + +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Callable +import urllib.request +import zipfile + +from packTab import Code, pack_table +from packTab.ucdxml import load_ucdxml, ucdxml_get_repertoire + + +UNICODE_VERSION = "17.0.0" +CODEPOINT_COUNT = 0x110000 +DATA_DIR = Path("data/ucd") / UNICODE_VERSION +XML_NAME = "ucd.nounihan.flat.zip" +UCD_NAME = "UCD.zip" +XML_URL = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucdxml/{XML_NAME}" +UCD_URL = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{UCD_NAME}" + +DEFAULT_RUNTIME_EXCLUDES = { + "JSN", + "Name_Alias", + "NFKC_CF", + "NFKC_SCF", + "na", + "na1", +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--data-dir", type=Path, default=DATA_DIR) + parser.add_argument( + "--compression", + type=float, + default=10, + help="packTab compression mode to use for prototype output (default: 10)", + ) + parser.add_argument( + "--download", + action="store_true", + help="download the required Unicode source files if they are missing", + ) + parser.add_argument( + "--json-out", + type=Path, + help="write the summary report as JSON", + ) + parser.add_argument( + "--c-out", + type=Path, + help="write generated C accessors for the selected properties", + ) + parser.add_argument( + "--only", + help="comma-separated property short names to analyze", + ) + parser.add_argument( + "--limit", + type=int, + help="only analyze the first N properties after filtering", + ) + parser.add_argument( + "--profile", + choices=("full", "runtime"), + default="runtime", + help="property profile to analyze (default: runtime)", + ) + parser.add_argument( + "--exclude", + help="comma-separated property short names to exclude in addition to the profile", + ) + return parser.parse_args() + + +def ensure_file(path: Path, url: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + if path.exists(): + return + with urllib.request.urlopen(url) as src, open(path, "wb") as dst: + dst.write(src.read()) + + +def parse_range(field: str) -> range: + if ".." in field: + start, end = field.split("..") + return range(int(start, 16), int(end, 16) + 1) + cp = int(field, 16) + return range(cp, cp + 1) + + +def parse_semicolon_records(text: str): + for raw in text.splitlines(): + line = raw.split("#", 1)[0].strip() + if not line: + continue + yield [field.strip() for field in line.split(";")] + + +def load_property_aliases(ucd_zip: Path) -> dict[str, dict[str, list[str] | str]]: + aliases = {} + with zipfile.ZipFile(ucd_zip) as zf: + for fields in parse_semicolon_records(zf.read("PropertyAliases.txt").decode("utf-8")): + aliases[fields[0]] = { + "long": fields[1], + "aliases": fields[2:], + } + return aliases + + +def load_xml_repertoire(xml_zip: Path): + return ucdxml_get_repertoire(load_ucdxml(xml_zip)) + + +def xml_property_values(repertoire, prop: str) -> list[str]: + return [entry.get(prop, "") for entry in repertoire] + + +def binary_property_values(ucd_zip: Path, member: str, prop_name: str) -> list[str]: + values = ["N"] * CODEPOINT_COUNT + with zipfile.ZipFile(ucd_zip) as zf: + text = zf.read(member).decode("utf-8") + for fields in parse_semicolon_records(text): + if fields[1] != prop_name: + continue + for cp in parse_range(fields[0]): + values[cp] = "Y" + return values + + +def string_property_values(ucd_zip: Path, member: str, prop_name: str) -> list[str]: + values = [""] * CODEPOINT_COUNT + with zipfile.ZipFile(ucd_zip) as zf: + text = zf.read(member).decode("utf-8") + for fields in parse_semicolon_records(text): + if fields[1] != prop_name: + continue + for cp in parse_range(fields[0]): + values[cp] = fields[2] + return values + + +def name_alias_values(ucd_zip: Path) -> list[str]: + aliases = defaultdict(list) + with zipfile.ZipFile(ucd_zip) as zf: + text = zf.read("NameAliases.txt").decode("utf-8") + for fields in parse_semicolon_records(text): + cp = int(fields[0], 16) + aliases[cp].append(f"{fields[2]}:{fields[1]}") + values = [""] * CODEPOINT_COUNT + for cp, vals in aliases.items(): + values[cp] = "|".join(vals) + return values + + +def iso_comment_values(ucd_zip: Path) -> list[str]: + values = [""] * CODEPOINT_COUNT + with zipfile.ZipFile(ucd_zip) as zf: + for line in zf.read("UnicodeData.txt").decode("utf-8").splitlines(): + fields = line.split(";") + values[int(fields[0], 16)] = fields[11] + return values + + +SUPPLEMENT_LOADERS: dict[str, tuple[str, Callable[[Path], list[str]]]] = { + "FC_NFKC": ( + "DerivedNormalizationProps.txt", + lambda ucd_zip: string_property_values( + ucd_zip, "DerivedNormalizationProps.txt", "FC_NFKC" + ), + ), + "Gr_Link": ( + "DerivedCoreProperties.txt", + lambda ucd_zip: binary_property_values( + ucd_zip, "DerivedCoreProperties.txt", "Grapheme_Link" + ), + ), + "Hyphen": ( + "PropList.txt", + lambda ucd_zip: binary_property_values(ucd_zip, "PropList.txt", "Hyphen"), + ), + "Name_Alias": ("NameAliases.txt", name_alias_values), + "XO_NFC": ( + "DerivedNormalizationProps.txt", + lambda ucd_zip: binary_property_values( + ucd_zip, "DerivedNormalizationProps.txt", "Expands_On_NFC" + ), + ), + "XO_NFD": ( + "DerivedNormalizationProps.txt", + lambda ucd_zip: binary_property_values( + ucd_zip, "DerivedNormalizationProps.txt", "Expands_On_NFD" + ), + ), + "XO_NFKC": ( + "DerivedNormalizationProps.txt", + lambda ucd_zip: binary_property_values( + ucd_zip, "DerivedNormalizationProps.txt", "Expands_On_NFKC" + ), + ), + "XO_NFKD": ( + "DerivedNormalizationProps.txt", + lambda ucd_zip: binary_property_values( + ucd_zip, "DerivedNormalizationProps.txt", "Expands_On_NFKD" + ), + ), + "isc": ("UnicodeData.txt", iso_comment_values), +} + +IGNORED_UNIHAN_PROPERTIES = { + "cjkAccountingNumeric", + "cjkCompatibilityVariant", + "cjkIICore", + "cjkIRG_GSource", + "cjkIRG_HSource", + "cjkIRG_JSource", + "cjkIRG_KPSource", + "cjkIRG_KSource", + "cjkIRG_MSource", + "cjkIRG_SSource", + "cjkIRG_TSource", + "cjkIRG_UKSource", + "cjkIRG_USource", + "cjkIRG_VSource", + "cjkMandarin", + "cjkOtherNumeric", + "cjkPrimaryNumeric", + "cjkRSUnicode", + "cjkTotalStrokes", + "cjkUnihanCore2020", +} + + +def runtime_excludes(props: set[str]) -> set[str]: + excluded = set(DEFAULT_RUNTIME_EXCLUDES) + excluded.update(prop for prop in props if prop.startswith("kEH_")) + excluded.update(prop for prop in props if prop.startswith("kTGT_")) + excluded.update(prop for prop in props if prop.startswith("kNSHU_")) + return excluded + + +def transform_bmg(values: list[str]) -> list[int]: + data = [0] * len(values) + for cp, value in enumerate(values): + if value: + data[cp] = int(value, 16) - cp + return data + + +def transform_dm_hangul(values: list[str]) -> list[str]: + out = list(values) + for cp in range(0xAC00, 0xD7A4): + out[cp] = "" + return out + + +PROPERTY_TRANSFORMS: dict[str, tuple[str, Callable[[list], list]]] = { + "bmg": ("delta from codepoint", transform_bmg), + "dm": ("Hangul syllables elided algorithmically", transform_dm_hangul), +} + + +def build_packed_data(values: list) -> tuple[list[int], dict | None, object]: + counts = Counter(values) + default = counts.most_common(1)[0][0] + if all(isinstance(value, int) for value in values): + return values, None, default + + mapping: dict[object, int] = {} + data: list[int] = [] + for value in values: + if value not in mapping: + mapping[value] = len(mapping) + data.append(mapping[value]) + return data, mapping, default + + +def sanitize_symbol(prop: str) -> str: + return "".join(ch if ch.isalnum() else "_" for ch in prop) + + +def analyze_property( + prop: str, + values: list, + compression: float, + metadata: dict[str, str], +) -> tuple[dict[str, object], dict[str, object]]: + transform = PROPERTY_TRANSFORMS.get(prop) + transformed_values = transform[1](values) if transform else values + data, mapping, default = build_packed_data(transformed_values) + packed_default = mapping[default] if mapping is not None else default + solution = pack_table(data, default=packed_default, compression=compression) + non_default = len(data) - transformed_values.count(default) + default_label = "" if default == "" else default + result = { + "property": prop, + "long_name": metadata.get("long", prop), + "source": metadata["source"], + "values": len(mapping) if mapping is not None else len(set(transformed_values)), + "default": default_label, + "non_default_codepoints": non_default, + "lookups": solution.nLookups, + "extra_ops": solution.nExtraOps, + "bytes": solution.cost, + "full_cost": solution.fullCost, + "transform": transform[0] if transform else "", + } + generated = { + "property": prop, + "symbol": sanitize_symbol(prop), + "solution": solution, + "mapping": mapping, + "default": default, + } + return result, generated + + +def write_c_output(path: Path, generated_props: list[dict[str, object]], profile: str) -> None: + code = Code("ucd") + header_lines = [ + f"/* Unicode {UNICODE_VERSION} non-Unihan prototype ({profile}) */", + "/* Generated by prototype_ucd_all.py */", + "", + ] + for item in generated_props: + prop = item["property"] + symbol = item["symbol"] + mapping = item["mapping"] + default = item["default"] + if mapping is None: + header_lines.append(f"/* {prop}: direct integer property; default {default} */") + else: + reverse = sorted(mapping.items(), key=lambda kv: kv[1]) + preview = ", ".join(f"{idx}={value!r}" for value, idx in reverse[:12]) + if len(reverse) > 12: + preview += ", ..." + default_id = mapping[default] + header_lines.append( + f"/* {prop}: default id {default_id}; values {preview} */" + ) + item["solution"].genCode(code, f"{symbol}_get", language="c", private=False) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + for line in header_lines: + f.write(line) + f.write("\n") + f.write("\n") + code.print_code(file=f, language="c", private=False) + + +def collect_property_sources( + repertoire, + aliases: dict[str, dict[str, list[str] | str]], +) -> tuple[set[str], dict[str, dict[str, str]]]: + xml_props = set() + for entry in repertoire: + xml_props.update(entry.keys()) + + metadata: dict[str, dict[str, str]] = {} + for prop in sorted(xml_props): + long_name = aliases.get(prop, {}).get("long", prop) + metadata[prop] = {"source": "xml", "long": str(long_name)} + + for prop in sorted(SUPPLEMENT_LOADERS): + long_name = aliases.get(prop, {}).get("long", prop) + metadata[prop] = {"source": SUPPLEMENT_LOADERS[prop][0], "long": str(long_name)} + + props = set(metadata) - IGNORED_UNIHAN_PROPERTIES + return props, metadata + + +def main() -> int: + args = parse_args() + + xml_zip = args.data_dir / XML_NAME + ucd_zip = args.data_dir / UCD_NAME + if args.download: + ensure_file(xml_zip, XML_URL) + ensure_file(ucd_zip, UCD_URL) + elif not (xml_zip.exists() and ucd_zip.exists()): + raise SystemExit( + f"Missing UCD input files under {args.data_dir}. Re-run with --download." + ) + + repertoire = load_xml_repertoire(xml_zip) + aliases = load_property_aliases(ucd_zip) + props, metadata = collect_property_sources(repertoire, aliases) + excluded = set() + if args.profile == "runtime": + excluded.update(runtime_excludes(props)) + if args.exclude: + excluded.update(item.strip() for item in args.exclude.split(",") if item.strip()) + props = sorted(props - excluded) + + if args.only: + wanted = {item.strip() for item in args.only.split(",") if item.strip()} + props = [prop for prop in props if prop in wanted] + if args.limit is not None: + props = props[: args.limit] + + results = [] + generated_props = [] + for prop in props: + source = metadata[prop]["source"] + if source == "xml": + values = xml_property_values(repertoire, prop) + else: + values = SUPPLEMENT_LOADERS[prop][1](ucd_zip) + result, generated = analyze_property(prop, values, args.compression, metadata[prop]) + results.append(result) + generated_props.append(generated) + print( + f"{prop:12} {results[-1]['bytes']:8} bytes " + f"{results[-1]['lookups']} lookups {results[-1]['values']:6} values " + f"{results[-1]['source']}" + ) + + total_bytes = sum(item["bytes"] for item in results) + total_full_cost = sum(item["full_cost"] for item in results) + summary = { + "unicode_version": UNICODE_VERSION, + "compression": args.compression, + "profile": args.profile, + "excluded_properties": sorted(excluded), + "property_count": len(results), + "total_bytes": total_bytes, + "total_full_cost": total_full_cost, + "properties": results, + } + + print() + print(f"Unicode {UNICODE_VERSION} non-Unihan prototype ({args.profile})") + print(f"Properties analyzed: {len(results)}") + print(f"Total packed bytes: {total_bytes}") + print(f"Total full cost: {total_full_cost}") + + if args.json_out: + args.json_out.parent.mkdir(parents=True, exist_ok=True) + with open(args.json_out, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, sort_keys=True) + f.write("\n") + if args.c_out: + write_c_output(args.c_out, generated_props, args.profile) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 7c1c240aac3f6e5005a5e12c92a6ffe691bfb7fd Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 18:10:59 -0300 Subject: [PATCH 2/6] Track inlined Unicode prototype properties --- prototype_ucd_all.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py index 9675bae..601343a 100644 --- a/prototype_ucd_all.py +++ b/prototype_ucd_all.py @@ -167,15 +167,6 @@ def name_alias_values(ucd_zip: Path) -> list[str]: return values -def iso_comment_values(ucd_zip: Path) -> list[str]: - values = [""] * CODEPOINT_COUNT - with zipfile.ZipFile(ucd_zip) as zf: - for line in zf.read("UnicodeData.txt").decode("utf-8").splitlines(): - fields = line.split(";") - values[int(fields[0], 16)] = fields[11] - return values - - SUPPLEMENT_LOADERS: dict[str, tuple[str, Callable[[Path], list[str]]]] = { "FC_NFKC": ( "DerivedNormalizationProps.txt", @@ -218,7 +209,6 @@ def iso_comment_values(ucd_zip: Path) -> list[str]: ucd_zip, "DerivedNormalizationProps.txt", "Expands_On_NFKD" ), ), - "isc": ("UnicodeData.txt", iso_comment_values), } IGNORED_UNIHAN_PROPERTIES = { @@ -293,6 +283,12 @@ def sanitize_symbol(prop: str) -> str: return "".join(ch if ch.isalnum() else "_" for ch in prop) +def is_fully_inlined(solution, symbol: str) -> bool: + code = Code("probe") + solution.genCode(code, f"{sanitize_symbol(symbol)}_get", language="c", private=False) + return not code.arrays + + def analyze_property( prop: str, values: list, @@ -318,6 +314,7 @@ def analyze_property( "bytes": solution.cost, "full_cost": solution.fullCost, "transform": transform[0] if transform else "", + "fully_inlined": is_fully_inlined(solution, prop), } generated = { "property": prop, @@ -431,12 +428,14 @@ def main() -> int: total_bytes = sum(item["bytes"] for item in results) total_full_cost = sum(item["full_cost"] for item in results) + inlined = [item["property"] for item in results if item["fully_inlined"]] summary = { "unicode_version": UNICODE_VERSION, "compression": args.compression, "profile": args.profile, "excluded_properties": sorted(excluded), "property_count": len(results), + "fully_inlined_properties": inlined, "total_bytes": total_bytes, "total_full_cost": total_full_cost, "properties": results, @@ -445,6 +444,7 @@ def main() -> int: print() print(f"Unicode {UNICODE_VERSION} non-Unihan prototype ({args.profile})") print(f"Properties analyzed: {len(results)}") + print(f"Fully inlined properties: {len(inlined)}") print(f"Total packed bytes: {total_bytes}") print(f"Total full cost: {total_full_cost}") From ebf4cc459f679c42e87cdb47288a06c52693c44c Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 18:16:03 -0300 Subject: [PATCH 3/6] Exclude deprecated UCD properties from prototype --- README.md | 1 + prototype_ucd_all.py | 81 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a3cfc84..cbb4198 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,7 @@ specialist `kEH_*`, `kTGT_*`, and `kNSHU_*` property families. Use `--profile full` to analyze the full non-Unihan property surface instead. Pass `--c-out path/to/ucd_runtime.c` to also emit a single C source file with one accessor per selected property. +Deprecated UCD properties such as `FC_NFKC`, `Gr_Link`, and `XO_*` are omitted. ## History diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py index 601343a..9f1d0bd 100644 --- a/prototype_ucd_all.py +++ b/prototype_ucd_all.py @@ -40,6 +40,15 @@ "na1", } +DEPRECATED_PROPERTIES = { + "FC_NFKC", + "Gr_Link", + "XO_NFC", + "XO_NFD", + "XO_NFKC", + "XO_NFKD", +} + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) @@ -263,6 +272,22 @@ def transform_dm_hangul(values: list[str]) -> list[str]: "dm": ("Hangul syllables elided algorithmically", transform_dm_hangul), } +SHARED_STRING_POOL_CANDIDATES = { + "FC_NFKC", + "EqUIdeo", + "bpb", + "cf", + "dm", + "lc", + "nv", + "scf", + "slc", + "stc", + "suc", + "tc", + "uc", +} + def build_packed_data(values: list) -> tuple[list[int], dict | None, object]: counts = Counter(values) @@ -289,6 +314,51 @@ def is_fully_inlined(solution, symbol: str) -> bool: return not code.arrays +def string_storage_bytes(value: str) -> int: + return len(value.encode("utf-8")) + 1 + + +def analyze_shared_string_pool(generated_props: list[dict[str, object]]) -> dict[str, object]: + candidate_props = [] + local_bytes = 0 + shared_strings: set[str] = set() + string_occurrences: defaultdict[str, set[str]] = defaultdict(set) + + for item in generated_props: + prop = item["property"] + mapping = item["mapping"] + if mapping is None or prop not in SHARED_STRING_POOL_CANDIDATES: + continue + values = set(mapping.keys()) + candidate_props.append(prop) + local_bytes += sum(string_storage_bytes(value) for value in values) + shared_strings.update(values) + for value in values: + string_occurrences[value].add(prop) + + shared_bytes = sum(string_storage_bytes(value) for value in shared_strings) + repeated = [ + { + "value": value, + "properties": sorted(props), + "bytes": string_storage_bytes(value), + } + for value, props in string_occurrences.items() + if len(props) > 1 + ] + repeated.sort(key=lambda item: (-len(item["properties"]), -item["bytes"], item["value"])) + + return { + "candidate_properties": sorted(candidate_props), + "property_count": len(candidate_props), + "local_string_bytes": local_bytes, + "shared_string_bytes": shared_bytes, + "potential_savings": local_bytes - shared_bytes, + "reused_string_count": len(repeated), + "most_reused_strings": repeated[:20], + } + + def analyze_property( prop: str, values: list, @@ -376,7 +446,7 @@ def collect_property_sources( long_name = aliases.get(prop, {}).get("long", prop) metadata[prop] = {"source": SUPPLEMENT_LOADERS[prop][0], "long": str(long_name)} - props = set(metadata) - IGNORED_UNIHAN_PROPERTIES + props = set(metadata) - IGNORED_UNIHAN_PROPERTIES - DEPRECATED_PROPERTIES return props, metadata @@ -439,6 +509,7 @@ def main() -> int: "total_bytes": total_bytes, "total_full_cost": total_full_cost, "properties": results, + "shared_string_pool": analyze_shared_string_pool(generated_props), } print() @@ -447,6 +518,14 @@ def main() -> int: print(f"Fully inlined properties: {len(inlined)}") print(f"Total packed bytes: {total_bytes}") print(f"Total full cost: {total_full_cost}") + pool = summary["shared_string_pool"] + print( + "Shared string pool candidates: " + f"{pool['property_count']} properties, " + f"{pool['local_string_bytes']} local bytes -> " + f"{pool['shared_string_bytes']} shared bytes " + f"(save {pool['potential_savings']})" + ) if args.json_out: args.json_out.parent.mkdir(parents=True, exist_ok=True) From 3eb52931530cfa7c5a9ec9e75e59bcb7aea07d70 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 18:20:02 -0300 Subject: [PATCH 4/6] Analyze pooled string UCD properties --- prototype_ucd_all.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py index 9f1d0bd..a4a620c 100644 --- a/prototype_ucd_all.py +++ b/prototype_ucd_all.py @@ -308,12 +308,6 @@ def sanitize_symbol(prop: str) -> str: return "".join(ch if ch.isalnum() else "_" for ch in prop) -def is_fully_inlined(solution, symbol: str) -> bool: - code = Code("probe") - solution.genCode(code, f"{sanitize_symbol(symbol)}_get", language="c", private=False) - return not code.arrays - - def string_storage_bytes(value: str) -> int: return len(value.encode("utf-8")) + 1 @@ -323,6 +317,8 @@ def analyze_shared_string_pool(generated_props: list[dict[str, object]]) -> dict local_bytes = 0 shared_strings: set[str] = set() string_occurrences: defaultdict[str, set[str]] = defaultdict(set) + current_table_bytes = 0 + pooled_table_bytes = 0 for item in generated_props: prop = item["property"] @@ -333,10 +329,23 @@ def analyze_shared_string_pool(generated_props: list[dict[str, object]]) -> dict candidate_props.append(prop) local_bytes += sum(string_storage_bytes(value) for value in values) shared_strings.update(values) + current_table_bytes += item["solution"].cost for value in values: string_occurrences[value].add(prop) shared_bytes = sum(string_storage_bytes(value) for value in shared_strings) + global_mapping = {value: i for i, value in enumerate(sorted(shared_strings))} + for item in generated_props: + prop = item["property"] + mapping = item["mapping"] + if mapping is None or prop not in SHARED_STRING_POOL_CANDIDATES: + continue + transformed_values = item["transformed_values"] + pooled_data = [global_mapping[value] for value in transformed_values] + pooled_default = global_mapping[item["default"]] + pooled_solution = pack_table(pooled_data, default=pooled_default, compression=10) + pooled_table_bytes += pooled_solution.cost + repeated = [ { "value": value, @@ -353,7 +362,12 @@ def analyze_shared_string_pool(generated_props: list[dict[str, object]]) -> dict "property_count": len(candidate_props), "local_string_bytes": local_bytes, "shared_string_bytes": shared_bytes, - "potential_savings": local_bytes - shared_bytes, + "current_table_bytes": current_table_bytes, + "pooled_table_bytes": pooled_table_bytes, + "current_total_bytes": current_table_bytes + local_bytes, + "pooled_total_bytes": pooled_table_bytes + shared_bytes, + "potential_savings": (current_table_bytes + local_bytes) + - (pooled_table_bytes + shared_bytes), "reused_string_count": len(repeated), "most_reused_strings": repeated[:20], } @@ -384,7 +398,6 @@ def analyze_property( "bytes": solution.cost, "full_cost": solution.fullCost, "transform": transform[0] if transform else "", - "fully_inlined": is_fully_inlined(solution, prop), } generated = { "property": prop, @@ -392,6 +405,7 @@ def analyze_property( "solution": solution, "mapping": mapping, "default": default, + "transformed_values": transformed_values, } return result, generated @@ -498,14 +512,12 @@ def main() -> int: total_bytes = sum(item["bytes"] for item in results) total_full_cost = sum(item["full_cost"] for item in results) - inlined = [item["property"] for item in results if item["fully_inlined"]] summary = { "unicode_version": UNICODE_VERSION, "compression": args.compression, "profile": args.profile, "excluded_properties": sorted(excluded), "property_count": len(results), - "fully_inlined_properties": inlined, "total_bytes": total_bytes, "total_full_cost": total_full_cost, "properties": results, @@ -515,15 +527,14 @@ def main() -> int: print() print(f"Unicode {UNICODE_VERSION} non-Unihan prototype ({args.profile})") print(f"Properties analyzed: {len(results)}") - print(f"Fully inlined properties: {len(inlined)}") print(f"Total packed bytes: {total_bytes}") print(f"Total full cost: {total_full_cost}") pool = summary["shared_string_pool"] print( "Shared string pool candidates: " f"{pool['property_count']} properties, " - f"{pool['local_string_bytes']} local bytes -> " - f"{pool['shared_string_bytes']} shared bytes " + f"{pool['current_total_bytes']} current bytes -> " + f"{pool['pooled_total_bytes']} pooled bytes " f"(save {pool['potential_savings']})" ) From 7c285fa104933cb2a9b1e82e7bd53ceee4e6b13f Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 18:22:26 -0300 Subject: [PATCH 5/6] Prototype pooled Unicode string tables --- README.md | 2 ++ prototype_ucd_all.py | 78 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/README.md b/README.md index cbb4198..6671e83 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,8 @@ specialist `kEH_*`, `kTGT_*`, and `kNSHU_*` property families. Use Pass `--c-out path/to/ucd_runtime.c` to also emit a single C source file with one accessor per selected property. Deprecated UCD properties such as `FC_NFKC`, `Gr_Link`, and `XO_*` are omitted. +Pass `--pooled-c-out path/to/ucd_pooled.c` to prototype a shared-string C +backend for the string-heavy runtime properties. ## History diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py index a4a620c..911dbde 100644 --- a/prototype_ucd_all.py +++ b/prototype_ucd_all.py @@ -74,6 +74,11 @@ def parse_args() -> argparse.Namespace: type=Path, help="write generated C accessors for the selected properties", ) + parser.add_argument( + "--pooled-c-out", + type=Path, + help="write a pooled-string C prototype for string-heavy properties", + ) parser.add_argument( "--only", help="comma-separated property short names to analyze", @@ -443,6 +448,75 @@ def write_c_output(path: Path, generated_props: list[dict[str, object]], profile code.print_code(file=f, language="c", private=False) +def pooled_string_candidates(generated_props: list[dict[str, object]]) -> list[dict[str, object]]: + return [ + item + for item in generated_props + if item["mapping"] is not None and item["property"] in SHARED_STRING_POOL_CANDIDATES + ] + + +def write_pooled_c_output( + path: Path, generated_props: list[dict[str, object]], profile: str, compression: float +) -> None: + candidates = pooled_string_candidates(generated_props) + shared_strings = sorted( + {value for item in candidates for value in item["mapping"].keys()} + ) + string_to_id = {value: i for i, value in enumerate(shared_strings)} + + blob = bytearray() + offsets = [] + for value in shared_strings: + offsets.append(len(blob)) + blob.extend(value.encode("utf-8")) + blob.append(0) + + code = Code("ucdpool") + header_lines = [ + f"/* Unicode {UNICODE_VERSION} pooled-string prototype ({profile}) */", + "/* Generated by prototype_ucd_all.py */", + "", + "/* Shared string pool covers these properties: */", + "/* " + ", ".join(item["property"] for item in candidates) + " */", + "", + ] + + code.addArray("uint8_t", "strpool", list(blob)) + code.addArray("uint32_t", "stroff", offsets) + + for item in candidates: + prop = item["property"] + symbol = item["symbol"] + transformed_values = item["transformed_values"] + pooled_data = [string_to_id[value] for value in transformed_values] + pooled_default = string_to_id[item["default"]] + pooled_solution = pack_table(pooled_data, default=pooled_default, compression=compression) + pooled_solution.genCode(code, f"{symbol}_id_get", language="c", private=False) + header_lines.append( + f"/* {prop}: pooled ids {len(set(pooled_data))}, default {pooled_default} */" + ) + body = ( + f"return (const char *)(ucdpool_strpool + " + f"ucdpool_stroff[ucdpool_{symbol}_id_get(u)]);" + ) + code.addFunction( + "const char *", + f"{symbol}_get", + (("unsigned", "u"),), + body, + private=False, + ) + + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + for line in header_lines: + f.write(line) + f.write("\n") + f.write("\n") + code.print_code(file=f, language="c", private=False) + + def collect_property_sources( repertoire, aliases: dict[str, dict[str, list[str] | str]], @@ -545,6 +619,10 @@ def main() -> int: f.write("\n") if args.c_out: write_c_output(args.c_out, generated_props, args.profile) + if args.pooled_c_out: + write_pooled_c_output( + args.pooled_c_out, generated_props, args.profile, args.compression + ) return 0 From aadd9b02424be326263e6c97f80075d2542704fa Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 1 Mar 2026 18:40:46 -0300 Subject: [PATCH 6/6] Exclude EqUIdeo from runtime profile --- prototype_ucd_all.py | 1 + 1 file changed, 1 insertion(+) diff --git a/prototype_ucd_all.py b/prototype_ucd_all.py index 911dbde..3aeca1a 100644 --- a/prototype_ucd_all.py +++ b/prototype_ucd_all.py @@ -32,6 +32,7 @@ UCD_URL = f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{UCD_NAME}" DEFAULT_RUNTIME_EXCLUDES = { + "EqUIdeo", "JSN", "Name_Alias", "NFKC_CF",