Skip to content

Commit a04e01a

Browse files
committed
add character sets
1 parent f998f5f commit a04e01a

29 files changed

+708
-401
lines changed

.eslintrc.yml

Lines changed: 0 additions & 50 deletions
This file was deleted.

.github/workflows/ci.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,13 @@ jobs:
3838
- name: Node ESM import smoke
3939
run: node scripts/ci/esm-smoke.mjs
4040

41-
- name: Test
42-
run: yarn test:unit
41+
- name: Test with coverage
42+
run: yarn test:unit && yarn cov:check
43+
44+
- name: Upload coverage
45+
run: yarn cov:send
46+
env:
47+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
4348

4449
esm-example:
4550
name: esm example (ubuntu)

.vscode/settings.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,22 @@
66
"**/.nyc_output": true
77
},
88
"cSpell.words": [
9+
"ACGT",
910
"aeiou",
1011
"alphanum",
1112
"ATCG",
13+
"bech",
1214
"bitauth",
1315
"charsets",
1416
"Crockford",
1517
"DÎÑG",
1618
"dîñgø",
19+
"dingodog",
1720
"dîngøsky",
1821
"dingoskyme",
1922
"Dyîdkø",
2023
"GACGGTCG",
24+
"geohash",
2125
"insgkskn",
2226
"îøsîndøk",
2327
"kiyooodd",

README.md

Lines changed: 86 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -327,106 +327,91 @@ Example: (output is rounded).
327327

328328
There are 31 pre-defined character sets:
329329

330-
| Name | Characters |
331-
| :------------- | :-------------------------------------------------------------------------------------------- |
332-
| Alpha | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz |
333-
| AlphaLower | abcdefghijklmnopqrstuvwxyz |
334-
| AlphaUpper | ABCDEFGHIJKLMNOPQRSTUVWXYZ |
335-
| AlphaNum | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 |
336-
| AlphaNumLower | abcdefghijklmnopqrstuvwxyz0123456789 |
337-
| AlphaNumUpper | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 |
338-
| Base16 | 0123456789ABCDEF |
339-
| Base32 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 |
340-
| Base32Hex | 0123456789abcdefghijklmnopqrstuv |
341-
| Base32HexUpper | 0123456789ABCDEFGHIJKLMNOPQRSTUV |
342-
| Crockford32 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ |
343-
| Decimal | 0123456789 |
344-
| Hex | 0123456789abcdef |
345-
| HexUpper | 0123456789ABCDEF |
346-
| SafeAscii | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^\_abcdefghijklmnopqrstuvwxyz{\|}~ |
347-
| Safe32 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT |
348-
| Safe64 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ |
349-
| Symbol | !#$%&()\*+,-./:;<=>?@[]^\_{\|}~ |
350-
| WordSafe32 | 23456789CFGHJMPQRVWXcfghjmpqrvwx |
351-
352-
Any string of up to 256 unique characters, including unicode, can be used for **`puid`** generation.
353-
354-
#### Description of non-obvious character sets
355-
356-
| Name | Description |
357-
| :------------- | :--------------------------------------------------------- |
358-
| Base16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 |
359-
| Base32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 |
360-
| Base32Hex | Lowercase of Base32HexUpper |
361-
| Base32HexUpper | https://datatracker.ietf.org/doc/html/rfc4648#section-7 |
362-
| Crockford32 | https://www.crockford.com/base32.html |
363-
| SafeAscii | Printable ascii that does not require escape in String |
364-
| Safe32 | Alpha and numbers picked to reduce chance of English words |
365-
| Safe64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 |
366-
| WordSafe32 | Alpha and numbers picked to reduce chance of English words |
367-
368-
Note: Safe32 and WordSafe32 are two different strategies for the same goal.
369-
370-
#### Character Set Metrics
371-
372-
`puid-js` provides functions to analyze the efficiency of character sets, particularly their **Entropy Transform Efficiency (ETE)**.
373-
374-
```js
375-
const { Chars, charMetrics } = require('puid-js')
376-
377-
const metrics = charMetrics(Chars.Safe64)
378-
// => {
379-
// avgBits: 6.0,
380-
// bitShifts: [[63, 6]],
381-
// ere: 0.75,
382-
// ete: 1.0
383-
// }
384-
```
385-
386-
##### Entropy Transform Efficiency (ETE)
387-
388-
ETE measures how efficiently random bits are transformed into characters during ID generation. Character sets with a power-of-2 number of characters (16, 32, 64, etc.) have perfect efficiency (ETE = 1.0), meaning no random bits are wasted. Other character sets must occasionally reject bit patterns that would produce invalid indices, resulting in some waste.
389-
390-
```js
391-
const { entropyTransformEfficiency, Chars } = require('puid-js')
392-
393-
entropyTransformEfficiency(Chars.Safe64) // => 1.0 (64 chars, perfect)
394-
entropyTransformEfficiency(Chars.AlphaNum) // => 0.966 (62 chars, ~3.4% waste)
395-
entropyTransformEfficiency(Chars.Decimal) // => 0.615 (10 chars, ~38.5% waste)
396-
```
397-
398-
##### Average Bits Per Character
399-
400-
The `avgBitsPerChar` function returns the average number of random bits consumed when generating each character, accounting for bit rejection in non-power-of-2 character sets:
401-
402-
```js
403-
const { avgBitsPerChar, Chars } = require('puid-js')
404-
405-
avgBitsPerChar(Chars.Safe64) // => 6.0 (uses exactly 6 bits per char)
406-
avgBitsPerChar(Chars.AlphaNum) // => 6.16 (uses ~6.16 bits due to rejection)
407-
avgBitsPerChar(Chars.Decimal) // => 5.4 (uses ~5.4 bits due to rejection)
408-
```
409-
410-
##### Complete Metrics
411-
412-
The `charMetrics` function returns comprehensive metrics for a character set:
413-
414-
```js
415-
const { charMetrics, Chars } = require('puid-js')
416-
417-
const metrics = charMetrics(Chars.AlphaNum)
418-
// => {
419-
// avgBits: 6.1613, // Average bits consumed per character
420-
// bitShifts: [[61,6],[63,5]], // Bit shift rules for generation
421-
// ere: 0.7443, // Entropy representation efficiency
422-
// ete: 0.9664 // Entropy transform efficiency
423-
// }
424-
```
425-
426-
These metrics help you understand the trade-offs between different character sets:
427-
- Power-of-2 sets (16, 32, 64 chars) have perfect efficiency but limited choice
428-
- Sets close to powers of 2 (like 62-char AlphaNum) have good efficiency
429-
- Small sets (like 10-char Decimal) have lower efficiency but may be required for specific use cases
330+
| Name | Count | ERE | ETE | Characters |
331+
|------|--------|-----|-----|------------|
332+
| Alpha | 52 | 5.7 | 0.84 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz |
333+
| AlphaLower | 26 | 4.7 | 0.81 | abcdefghijklmnopqrstuvwxyz |
334+
| AlphaUpper | 26 | 4.7 | 0.81 | ABCDEFGHIJKLMNOPQRSTUVWXYZ |
335+
| AlphaNum | 62 | 5.95 | 0.97 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 |
336+
| AlphaNumLower | 36 | 5.17 | 0.65 | abcdefghijklmnopqrstuvwxyz0123456789 |
337+
| AlphaNumUpper | 36 | 5.17 | 0.65 | ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 |
338+
| Base16 | 16 | 4.0 | 1.0 | 0123456789ABCDEF |
339+
| Base32 | 32 | 5.0 | 1.0 | ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 |
340+
| Base32Hex | 32 | 5.0 | 1.0 | 0123456789abcdefghijklmnopqrstuv |
341+
| Base32HexUpper | 32 | 5.0 | 1.0 | 0123456789ABCDEFGHIJKLMNOPQRSTUV |
342+
| Base36 | 36 | 5.17 | 0.65 | 0123456789abcdefghijklmnopqrstuvwxyz |
343+
| Base36Upper | 36 | 5.17 | 0.65 | 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ |
344+
| Base45 | 45 | 5.49 | 0.78 | 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%\*+-./: |
345+
| Base58 | 58 | 5.86 | 0.91 | 123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz |
346+
| Base62 | 62 | 5.95 | 0.97 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 |
347+
| Base85 | 85 | 6.41 | 0.77 | !"#$%&'()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\\]^\_\`abcdefghijklmnopqrstu |
348+
| Bech32 | 32 | 5.0 | 1.0 | 023456789acdefghjklmnpqrstuvwxyz |
349+
| Boolean | 2 | 1.0 | 1.0 | TF |
350+
| Crockford32 | 32 | 5.0 | 1.0 | 0123456789ABCDEFGHJKMNPQRSTVWXYZ |
351+
| Decimal | 10 | 3.32 | 0.62 | 0123456789 |
352+
| Dna | 4 | 2.0 | 1.0 | ACGT |
353+
| Geohash | 32 | 5.0 | 1.0 | 0123456789bcdefghjkmnpqrstuvwxyz |
354+
| Hex | 16 | 4.0 | 1.0 | 0123456789abcdef |
355+
| HexUpper | 16 | 4.0 | 1.0 | 0123456789ABCDEF |
356+
| SafeAscii | 90 | 6.49 | 0.8 | !#$%&()\*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^\_abcdefghijklmnopqrstuvwxyz{\|}~ |
357+
| Safe32 | 32 | 5.0 | 1.0 | 2346789bdfghjmnpqrtBDFGHJLMNPQRT |
358+
| Safe64 | 64 | 6.0 | 1.0 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-\_ |
359+
| Symbol | 28 | 4.81 | 0.89 | !#$%&()\*+,-./:;<=>?@\[\]^\_{\|}~ |
360+
| UrlSafe | 66 | 6.04 | 0.63 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-.\_~ |
361+
| WordSafe32 | 32 | 5.0 | 1.0 | 23456789CFGHJMPQRVWXcfghjmpqrvwx |
362+
| ZBase32 | 32 | 5.0 | 1.0 | ybndrfg8ejkmcpqxot1uwisza345h769 |
363+
364+
Note: The [Metrics](#metrics) section explains ERE and ETE.
365+
366+
##### Description of non-obvious character sets
367+
368+
| Name | Description |
369+
| :---------------- | :--------------------------------------------------------- |
370+
| Base16 | https://datatracker.ietf.org/doc/html/rfc4648#section-8 |
371+
| Base32 | https://datatracker.ietf.org/doc/html/rfc4648#section-6 |
372+
| Base32Hex | Lowercase of Base32HexUpper |
373+
| Base32HexUpper | https://datatracker.ietf.org/doc/html/rfc4648#section-7 |
374+
| Base36 | Used by many URL shorteners |
375+
| Base58 | Bitcoin base58 alphabet (excludes 0, O, I, l) |
376+
| Base85 | Used in Adobe PostScript and PDF |
377+
| Bech32 | Bitcoin SegWit address encoding |
378+
| Dna | DNA nucleotide bases (Adenine, Cytosine, Guanine, Thymine) |
379+
| Ascii85 | Same as Base85 |
380+
| Ascii90 | Same as SafeAscii |
381+
| Crockford32 | https://www.crockford.com/base32.html |
382+
| Geohash | Used for encoding geographic coordinates |
383+
| SafeAscii | Printable ascii that does not require escape in String |
384+
| Safe32 | Alpha and numbers picked to reduce chance of English words |
385+
| Safe64 | https://datatracker.ietf.org/doc/html/rfc4648#section-5 |
386+
| UrlSafe | https://datatracker.ietf.org/doc/html/rfc3986#section-2.3 |
387+
| WordSafe32 | Alpha and numbers picked to reduce chance of English words |
388+
| ZBase32 | Zooko's Base32 |
389+
390+
#### Custom
391+
392+
Any `String` of up to 256 unique characters can be used for **`puid`** generation, with custom characters optimized in the same manner as the pre-defined character sets. The characters must be unique. This isn't strictly a technical requirement (**PUID** could handle duplicate characters), but the resulting randomness of the IDs is maximal when the characters are unique, so **PUID** enforces that restriction.
393+
394+
### Metrics
395+
396+
#### Entropy Representation Efficiency
397+
398+
Entropy Representation Efficiency (ERE) is a measure of how efficiently a string ID represents the entropy of the ID itself. When referring to the entropy of an ID, we mean the Shannon Entropy of the character sequence, and that is maximal when all the permissible characters are equally likely to occur. In most random ID generators, this is the case, and the ERE is solely dependent on the count of characters in the charset, where each character represents **log2(count)** of entropy (a computer specific calc of general Shannon entropy). For example, for a hex charset there are **16** hex characters, so each character "carries" **log2(16) = 4** bits of entropy in the string ID. We say the bits per character is **4** and a random ID of **12** hex characters has **48** bits of entropy.
399+
400+
ERE is measured as a ratio of the bits of entropy for the ID divided by the number of bits required to represent the string (**8** bits per ID character). If each character is equally probable (the most common case), ERE is **(bits-per-char * id_len) / (8 bits * id_len)**, which simplifies to **bits-per-character/8**. The ERE column in the character set table above lists the bits per character for each charset; dividing that value by **8** gives the ERE ratio.
401+
402+
There is, however, a particular random ID exception where each character is _**not**_ equally probable, namely, the often used v4 format of UUIDs. In that format, there are hyphens that carry no entropy (entropy is uncertainty, and there is _**no uncertainty**_ as to where those hyphens will be), one hex digit that is actually constrained to 1 of only 4 hex values and another that is fixed. This formatting results in an ID of 36 characters with a total entropy of 122 bits. The ERE of a v4 UUID is, therefore, **122 / (8 * 36) = 0.4236**.
403+
404+
#### Entropy Transform Efficiency
405+
406+
Entropy Transform Efficiency (ETE) is a measure of how efficiently source entropy is transformed into random ID entropy. For charsets with a character count that is a power of 2, all of the source entropy bits can be utilized during random ID generation. Each generated ID character requires exactly **log2(count)** bits, so the incoming source entropy can easily be carved into appropriate indices for character selection. Since ETE represents the ratio of output entropy bits to input entropy source, when all of the bits are utilized ETE is **1.0**.
407+
408+
Even for charsets with power of 2 character count, ETE is only the theoretical maximum of **1.0** _**if**_ the input entropy source is used as described above. Unfortunately, that is not the case with many random ID generation schemes. Some schemes use the entire output of a call to source entropy to create a single index used to select a character. Such schemes have very poor ETE.
409+
410+
For charsets with a character count that is not a power of 2, some bits will inevitably be discarded since the smallest number of bits required to select a character, **ceil(log2(count))**, will potentially result in an index beyond the character count. A first-cut, naïve approach to this reality is to simply throw away all the bits when the index is too large.
411+
412+
However, a more sophisticated scheme of bit slicing can actually improve on the naïve approach. Puid extends the bit slicing scheme by adding a bit shifting scheme to the algorithm, wherein a _**minimum**_ number of bits in the "over the limit" bits are discarded by observing that some bit patterns of length less than **ceil(log2(count))** already guarantee the bits will be over the limit, and _**only**_ those bits need be discarded.
413+
414+
As an example, using the **AlphaNumLower** charset, which has 36 characters, **ceil(log2(36)) = 6** bits are required to create a suitable index. However, if those bits start with the bit pattern **11xxxx**, the index would be out of bounds regardless of the **xxxx** bits, so Puid only tosses the first two bits and keeps the trailing four bits for use in the next index. (It is beyond scope to discuss here, but analysis shows this bit shifting scheme does not alter the random characteristics of generated IDs). So whereas the naïve approach would have an ETE of **0.485**, Puid achieves an ETE of **0.646**, a **33%** improvement. The `bench/alphanum_lower_ete.exs` script has detailed analysis.
430415

431416
[TOC](#TOC)
432417

@@ -633,7 +618,7 @@ Hmmm. Looks like there are 500,000 IDs expected and the repeat risk is 1 in a tr
633618
```js
634619
const { Chars, puid } = require('puid-js')
635620

636-
const { generator: randId } = puid(chars:Chars.SAFE32, total:10e6, risk: 1e15)
621+
const { generator: randId } = puid({ chars: Chars.Safe32, total: 10e6, risk: 1e15 })
637622
randId()
638623
// => 'tp9TJPNM2rmMLN22prFf'
639624
```

eslint.config.mjs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import { FlatCompat } from '@eslint/eslintrc'
22
import js from '@eslint/js'
3-
import tsParser from '@typescript-eslint/parser'
43
import tsPlugin from '@typescript-eslint/eslint-plugin'
5-
import importPlugin from 'eslint-plugin-import'
4+
import tsParser from '@typescript-eslint/parser'
65
import eslintComments from 'eslint-plugin-eslint-comments'
76
import functionalPlugin from 'eslint-plugin-functional'
7+
import importPlugin from 'eslint-plugin-import'
88
import globals from 'globals'
9+
import { fileURLToPath } from 'node:url'
10+
import { dirname } from 'node:path'
911

1012
const functionalPatched = {
1113
...functionalPlugin,
@@ -15,8 +17,11 @@ const functionalPatched = {
1517
}
1618
}
1719

20+
const __filename = fileURLToPath(import.meta.url)
21+
const __dirname = dirname(__filename)
22+
1823
const compat = new FlatCompat({
19-
baseDirectory: new URL('.', import.meta.url).pathname,
24+
baseDirectory: __dirname,
2025
recommendedConfig: js.configs.recommended
2126
})
2227

package.json

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,17 +69,14 @@
6969
"build:module": "tsc -p tsconfig.module.json && node scripts/build/mjsify.cjs",
7070
"build:pack": "yarn build && npm pack",
7171
"build:test": "rm -rf ./build/test && tsc -p tsconfig.test.json",
72-
"check-cli": "run-s test diff-integration-tests check-integration-tests",
73-
"check-integration-tests": "run-s check-integration-test:*",
7472
"cov": "run-s build test:unit cov:html cov:lcov && open-cli coverage/index.html",
7573
"cov:html": "nyc report --reporter=html",
7674
"cov:lcov": "nyc report --reporter=lcov",
7775
"cov:send": "run-s cov:lcov && codecov",
7876
"cov:check": "nyc report && nyc check-coverage --lines 100 --functions 100 --branches 100",
7977
"doc": "run-s doc:html && open-cli build/docs/index.html",
80-
"doc:html": "typedoc src/ --exclude **/*.spec.ts --target ES6 --mode file --out build/docs",
81-
"doc:json": "typedoc src/ --exclude **/*.spec.ts --target ES6 --mode file --json build/docs/typedoc.json",
82-
"diff-integration-tests": "mkdir -p diff && rm -rf diff/test && cp -r test diff/test && rm -rf diff/test/test-*/.git && cd diff && git init --quiet && git add -A && git commit --quiet --no-verify --allow-empty -m 'WIP' && echo '\\n\\nCommitted most recent integration test output in the \"diff\" directory. Review the changes with \"cd diff && git diff HEAD\" or your preferred git diff viewer.'",
78+
"doc:html": "typedoc --out build/docs --entryPoints src --exclude \"**/*.spec.ts\"",
79+
"doc:json": "typedoc --json build/docs/typedoc.json --entryPoints src --exclude \"**/*.spec.ts\"",
8380
"doc:publish": "gh-pages -m \"[ci skip] Updates\" -d build/docs",
8481
"fix": "run-s fix:*",
8582
"fix:prettier": "prettier \"src/**/*.ts\" --write",

0 commit comments

Comments
 (0)