Skip to content

Commit f998f5f

Browse files
committed
add ete
1 parent 43c8db1 commit f998f5f

File tree

8 files changed

+510
-6
lines changed

8 files changed

+510
-6
lines changed

README.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ Example:
262262
- `chars`: Source characters
263263
- `charsName`: Name of pre-defined `Chars` or `custom`
264264
- `ere`: Entropy representation efficiency
265+
- `ete`: Entropy transform efficiency
265266
- `length`: ID string length
266267

267268
Example:
@@ -281,6 +282,7 @@ Example:
281282
chars: '234567ABCDEFGHIJKLMNOPQRSTUVWXYZ',
282283
charsName: 'base32',
283284
ere: 0.63,
285+
ete: 1.0,
284286
length: 16
285287
}
286288
```
@@ -365,6 +367,67 @@ Any string of up to 256 unique characters, including unicode, can be used for **
365367

366368
Note: Safe32 and WordSafe32 are two different strategies for the same goal.
367369

370+
#### Character Set Metrics
371+
372+
`puid-js` provides functions to analyze the efficiency of character sets, particularly their **Entropy Transform Efficiency (ETE)**.
373+
374+
```js
375+
const { Chars, charMetrics } = require('puid-js')
376+
377+
const metrics = charMetrics(Chars.Safe64)
378+
// => {
379+
// avgBits: 6.0,
380+
// bitShifts: [[63, 6]],
381+
// ere: 0.75,
382+
// ete: 1.0
383+
// }
384+
```
385+
386+
##### Entropy Transform Efficiency (ETE)
387+
388+
ETE measures how efficiently random bits are transformed into characters during ID generation: it is the ratio of the theoretical bits per character (`log2` of the number of characters) to the average number of random bits actually consumed per character. Character sets with a power-of-2 number of characters (16, 32, 64, etc.) have perfect efficiency (ETE = 1.0), meaning no random bits are wasted. Other character sets must occasionally reject bit patterns that would produce invalid indices, resulting in some waste.
389+
390+
```js
391+
const { entropyTransformEfficiency, Chars } = require('puid-js')
392+
393+
entropyTransformEfficiency(Chars.Safe64) // => 1.0 (64 chars, perfect)
394+
entropyTransformEfficiency(Chars.AlphaNum) // => 0.966 (62 chars, ~3.4% waste)
395+
entropyTransformEfficiency(Chars.Decimal) // => 0.615 (10 chars, ~38.5% waste)
396+
```
397+
398+
##### Average Bits Per Character
399+
400+
The `avgBitsPerChar` function returns the average number of random bits consumed when generating each character, accounting for bit rejection in non-power-of-2 character sets:
401+
402+
```js
403+
const { avgBitsPerChar, Chars } = require('puid-js')
404+
405+
avgBitsPerChar(Chars.Safe64) // => 6.0 (uses exactly 6 bits per char)
406+
avgBitsPerChar(Chars.AlphaNum) // => 6.16 (uses ~6.16 bits due to rejection)
407+
avgBitsPerChar(Chars.Decimal) // => 5.4 (uses ~5.4 bits due to rejection)
408+
```
409+
410+
##### Complete Metrics
411+
412+
The `charMetrics` function returns comprehensive metrics for a character set:
413+
414+
```js
415+
const { charMetrics, Chars } = require('puid-js')
416+
417+
const metrics = charMetrics(Chars.AlphaNum)
418+
// => {
419+
// avgBits: 6.1613, // Average bits consumed per character
420+
// bitShifts: [[61,6],[63,5]], // Bit shift rules for generation
421+
// ere: 0.7443, // Entropy representation efficiency
422+
// ete: 0.9664 // Entropy transform efficiency
423+
// }
424+
```
425+
426+
These metrics help you understand the trade-offs between different character sets:
427+
- Power-of-2 sets (16, 32, 64 chars) have perfect efficiency but limited choice
428+
- Sets close to powers of 2 (like 62-char AlphaNum) have good efficiency
429+
- Sets well below the next power of 2 (like 10-char Decimal, which draws 4 bits per attempt) have lower efficiency but may be required for specific use cases
430+
368431
[TOC](#TOC)
369432

370433
## <a name="Motivation"></a>Motivation

src/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
export type { EntropyByBytes, EntropyByValues, Puid, PuidInfo } from './types/puid'
2-
export { Chars, validChars } from './lib/chars'
2+
export type { CharMetrics } from './lib/chars'
3+
export { Chars, validChars, charMetrics, entropyTransformEfficiency, avgBitsPerChar } from './lib/chars'
34
export { bitsForLen, entropyBits, entropyBitsPerChar, lenForBits } from './lib/entropy'
45
export { default as puid } from './lib/puid'
56
export { default as generate } from './generate'

src/lib/chars.spec.ts

Lines changed: 203 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import test from 'ava'
22

3-
import { Chars, validChars } from './chars'
3+
import { Chars, validChars, charMetrics, entropyTransformEfficiency, avgBitsPerChar } from './chars'
44

55
test('pre-defined Chars', (t) => {
66
const allChars = [
@@ -53,3 +53,205 @@ test('too many custom characters', (t) => {
5353
const tooMany = new Array(257).fill('a').toString()
5454
t.regex(invalidChars(tooMany), /greater/)
5555
})
56+
57+
// ETE (Entropy Transform Efficiency) Tests
58+
59+
test('charMetrics returns correct structure', (t) => {
60+
const metrics = charMetrics(Chars.Hex)
61+
t.truthy(metrics.avgBits)
62+
t.truthy(metrics.bitShifts)
63+
t.truthy(metrics.ere)
64+
t.truthy(metrics.ete)
65+
t.is(typeof metrics.avgBits, 'number')
66+
t.is(typeof metrics.ere, 'number')
67+
t.is(typeof metrics.ete, 'number')
68+
t.true(Array.isArray(metrics.bitShifts))
69+
})
70+
71+
test('power-of-2 charsets have ETE = 1.0', (t) => {
72+
// Test all power-of-2 charsets
73+
const powerOf2Charsets = [
74+
{ chars: Chars.Hex, size: 16 },
75+
{ chars: Chars.HexUpper, size: 16 },
76+
{ chars: Chars.Base32, size: 32 },
77+
{ chars: Chars.Base32Hex, size: 32 },
78+
{ chars: Chars.Base32HexUpper, size: 32 },
79+
{ chars: Chars.Crockford32, size: 32 },
80+
{ chars: Chars.Safe32, size: 32 },
81+
{ chars: Chars.WordSafe32, size: 32 },
82+
{ chars: Chars.Safe64, size: 64 }
83+
]
84+
85+
powerOf2Charsets.forEach(({ chars, size }) => {
86+
const metrics = charMetrics(chars)
87+
t.is(chars.length, size, `${size} char charset should have ${size} chars`)
88+
t.is(metrics.ete, 1.0, `${size} char charset should have ETE = 1.0`)
89+
t.is(metrics.avgBits, Math.ceil(Math.log2(size)), `${size} char charset avgBits should equal bits per char`)
90+
})
91+
})
92+
93+
test('non-power-of-2 charsets have ETE < 1.0', (t) => {
94+
const nonPowerOf2Charsets = [
95+
Chars.Alpha,
96+
Chars.AlphaLower,
97+
Chars.AlphaUpper,
98+
Chars.AlphaNum,
99+
Chars.AlphaNumLower,
100+
Chars.AlphaNumUpper,
101+
Chars.Decimal,
102+
Chars.Symbol,
103+
Chars.SafeAscii
104+
]
105+
106+
nonPowerOf2Charsets.forEach((chars) => {
107+
const metrics = charMetrics(chars)
108+
t.true(metrics.ete > 0, `${chars.length} char charset ETE should be > 0`)
109+
t.true(metrics.ete < 1.0, `${chars.length} char charset ETE should be < 1.0`)
110+
t.true(metrics.avgBits > Math.log2(chars.length), `${chars.length} char charset avgBits should be > theoretical bits`)
111+
})
112+
})
113+
114+
test('specific ETE values match expected ranges', (t) => {
115+
// Test specific charsets against expected ETE values
116+
// These are based on the actual algorithm results
117+
118+
const metrics52 = charMetrics(Chars.Alpha) // 52 chars
119+
t.true(metrics52.ete > 0.84 && metrics52.ete < 0.85, `Alpha (52 chars) ETE should be ~0.842, got ${metrics52.ete}`)
120+
121+
const metrics26 = charMetrics(Chars.AlphaLower) // 26 chars
122+
t.true(metrics26.ete > 0.81 && metrics26.ete < 0.82, `AlphaLower (26 chars) ETE should be ~0.815, got ${metrics26.ete}`)
123+
124+
const metrics62 = charMetrics(Chars.AlphaNum) // 62 chars
125+
t.true(metrics62.ete > 0.96 && metrics62.ete < 0.97, `AlphaNum (62 chars) ETE should be ~0.966, got ${metrics62.ete}`)
126+
127+
const metrics36 = charMetrics(Chars.AlphaNumLower) // 36 chars
128+
t.true(metrics36.ete > 0.64 && metrics36.ete < 0.65, `AlphaNumLower (36 chars) ETE should be ~0.646, got ${metrics36.ete}`)
129+
130+
const metrics10 = charMetrics(Chars.Decimal) // 10 chars
131+
t.true(metrics10.ete > 0.61 && metrics10.ete < 0.62, `Decimal (10 chars) ETE should be ~0.615, got ${metrics10.ete}`)
132+
133+
const metrics90 = charMetrics(Chars.SafeAscii) // 90 chars
134+
t.true(metrics90.ete > 0.80 && metrics90.ete < 0.81, `SafeAscii (90 chars) ETE should be ~0.805, got ${metrics90.ete}`)
135+
})
136+
137+
test('bit shifts are calculated correctly for charMetrics', (t) => {
138+
// Power-of-2 charsets should have a single bit shift
139+
const hex = charMetrics(Chars.Hex)
140+
t.is(hex.bitShifts.length, 1)
141+
t.deepEqual(hex.bitShifts[0], [15, 4])
142+
143+
const safe64 = charMetrics(Chars.Safe64)
144+
t.is(safe64.bitShifts.length, 1)
145+
t.deepEqual(safe64.bitShifts[0], [63, 6])
146+
147+
// Non-power-of-2 charsets should have multiple bit shifts
148+
const alpha = charMetrics(Chars.Alpha) // 52 chars
149+
t.true(alpha.bitShifts.length > 1)
150+
t.deepEqual(alpha.bitShifts[0], [51, 6]) // Base value
151+
152+
const decimal = charMetrics(Chars.Decimal) // 10 chars
153+
t.true(decimal.bitShifts.length > 1)
154+
t.deepEqual(decimal.bitShifts[0], [9, 4]) // Base value
155+
})
156+
157+
test('avgBitsPerChar calculates correctly', (t) => {
158+
// Power-of-2 charsets
159+
t.is(avgBitsPerChar(Chars.Hex), 4)
160+
t.is(avgBitsPerChar(Chars.Base32), 5)
161+
t.is(avgBitsPerChar(Chars.Safe64), 6)
162+
163+
// Non-power-of-2 charsets should have higher avgBits than theoretical
164+
const decimal = Chars.Decimal // 10 chars
165+
const decimalAvg = avgBitsPerChar(decimal)
166+
const decimalTheoretical = Math.log2(10)
167+
t.true(decimalAvg > decimalTheoretical)
168+
t.true(decimalAvg > 5) // Actually 5.4 due to rejection
169+
170+
const alpha = Chars.Alpha // 52 chars
171+
const alphaAvg = avgBitsPerChar(alpha)
172+
const alphaTheoretical = Math.log2(52)
173+
t.true(alphaAvg > alphaTheoretical)
174+
t.true(alphaAvg > 6) // Actually 6.77 due to rejection
175+
})
176+
177+
test('entropyTransformEfficiency returns correct values', (t) => {
178+
t.is(entropyTransformEfficiency(Chars.Safe64), 1.0)
179+
t.is(entropyTransformEfficiency(Chars.Hex), 1.0)
180+
181+
const alphaEte = entropyTransformEfficiency(Chars.Alpha)
182+
t.true(alphaEte > 0.84 && alphaEte < 0.85)
183+
184+
const decimalEte = entropyTransformEfficiency(Chars.Decimal)
185+
t.true(decimalEte > 0.61 && decimalEte < 0.62)
186+
})
187+
188+
test('custom charset ETE calculation', (t) => {
189+
// Test with custom charsets
190+
const customPow2 = 'dingosky' // 8 chars (power of 2)
191+
const metrics8 = charMetrics(customPow2)
192+
t.is(metrics8.ete, 1.0)
193+
t.is(metrics8.avgBits, 3)
194+
195+
const custom10 = 'dingoskyab' // 10 chars (non-power of 2)
196+
const metrics10 = charMetrics(custom10)
197+
t.true(metrics10.ete > 0 && metrics10.ete < 1.0)
198+
t.true(metrics10.avgBits > Math.log2(10))
199+
})
200+
201+
test('ERE calculation is reasonable', (t) => {
202+
// ERE should be between 0 and 1 for all charsets
203+
const charsets = [
204+
Chars.Alpha,
205+
Chars.AlphaNum,
206+
Chars.Hex,
207+
Chars.Safe64,
208+
Chars.Decimal,
209+
Chars.SafeAscii
210+
]
211+
212+
charsets.forEach((chars) => {
213+
const metrics = charMetrics(chars)
214+
t.true(metrics.ere > 0, `ERE should be positive for ${chars.length} char charset`)
215+
t.true(metrics.ere <= 1.0, `ERE should be <= 1.0 for ${chars.length} char charset`)
216+
})
217+
})
218+
219+
test('metrics are consistent across multiple calls', (t) => {
220+
const chars = Chars.AlphaNum
221+
const metrics1 = charMetrics(chars)
222+
const metrics2 = charMetrics(chars)
223+
224+
t.deepEqual(metrics1, metrics2, 'Metrics should be identical for same charset')
225+
})
226+
227+
test('odd vs even sized charsets', (t) => {
228+
// Test that odd-sized charsets are handled correctly
229+
const odd9 = 'abcdefghi' // 9 chars (odd)
230+
const metricsOdd = charMetrics(odd9)
231+
t.true(metricsOdd.ete > 0 && metricsOdd.ete < 1.0)
232+
t.is(metricsOdd.bitShifts[0]?.[0], 9) // Base value should equal the charset size (9) for odd sizes
233+
234+
const even10 = 'abcdefghij' // 10 chars (even)
235+
const metricsEven = charMetrics(even10)
236+
t.true(metricsEven.ete > 0 && metricsEven.ete < 1.0)
237+
t.is(metricsEven.bitShifts[0]?.[0], 9) // Base value should be 9 (10-1) for even
238+
})
239+
240+
test('boundary cases for ETE', (t) => {
241+
// Minimum charset (2 chars - power of 2)
242+
const min2 = 'ab'
243+
const metricsMin = charMetrics(min2)
244+
t.is(metricsMin.ete, 1.0)
245+
t.is(metricsMin.avgBits, 1)
246+
247+
// 3 chars (non-power of 2)
248+
const three = 'abc'
249+
const metrics3 = charMetrics(three)
250+
t.true(metrics3.ete > 0 && metrics3.ete < 1.0)
251+
t.true(metrics3.avgBits > Math.log2(3))
252+
253+
// Large charset close to power of 2
254+
const chars63 = Chars.AlphaNum + '-' // 63 chars (one less than 64)
255+
const metrics63 = charMetrics(chars63)
256+
t.true(metrics63.ete > 0.98 && metrics63.ete < 1.0, `63 chars should have high ETE, got ${metrics63.ete}`)
257+
})

0 commit comments

Comments
 (0)