Skip to content

Commit 8ee8cda

Browse files
committed
feat(hashing): add 6 advanced open addressing algorithms
- LinearProbing: Standard collision resolution
- RobinHoodHashMap: PSL (probe sequence length) optimization
- HopscotchHashMap: Neighborhood caching
- CoalescedHashMap: Hybrid chaining
- FNVHashMap: FNV-1a hash function
- PowerOfTwoHashMap: Bitmask sizing
- Add comprehensive unit tests in test_advanced_hashing.py
1 parent 2c15b8c commit 8ee8cda

File tree

7 files changed

+1080
-0
lines changed

7 files changed

+1080
-0
lines changed
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Coalesced hashing (hybrid of open addressing + chaining inside the table).
4+
5+
Reference: https://en.wikipedia.org/wiki/Hash_table#Coalesced_hashing
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from collections.abc import Iterator, MutableMapping
11+
from dataclasses import dataclass
12+
from typing import TypeVar
13+
14+
KEY = TypeVar("KEY")
15+
VAL = TypeVar("VAL")
16+
17+
18+
@dataclass(slots=True)
19+
class _Node[KEY, VAL]:
20+
key: KEY
21+
val: VAL
22+
next: int # -1 means end of chain
23+
24+
25+
class CoalescedHashMap(MutableMapping[KEY, VAL]):
26+
"""
27+
Coalesced hashing stores the chain pointers inside the array.
28+
29+
This implementation uses:
30+
- Primary area: all indices, chaining occurs via `next` pointers.
31+
- Free slot choice: highest-index free slot (easy to explain + deterministic).
32+
33+
>>> ch = CoalescedHashMap(5)
34+
>>> ch["a"] = 1
35+
>>> ch["b"] = 2
36+
>>> ch["a"]
37+
1
38+
>>> len(ch)
39+
2
40+
"""
41+
42+
def __init__(self, capacity: int = 8, capacity_factor: float = 0.8) -> None:
43+
if capacity < 1:
44+
raise ValueError("capacity must be >= 1")
45+
if not (0.0 < capacity_factor < 1.0):
46+
raise ValueError("capacity_factor must be between 0 and 1")
47+
48+
self._capacity_factor = capacity_factor
49+
self._table: list[_Node[KEY, VAL] | None] = [None] * capacity
50+
self._len = 0
51+
52+
def _home(self, key: KEY) -> int:
53+
return hash(key) % len(self._table)
54+
55+
def _is_full(self) -> bool:
56+
return self._len >= int(len(self._table) * self._capacity_factor)
57+
58+
def _find_free_from_end(self) -> int:
59+
for i in range(len(self._table) - 1, -1, -1):
60+
if self._table[i] is None:
61+
return i
62+
return -1
63+
64+
def _resize(self, new_capacity: int) -> None:
65+
old_items = list(self.items())
66+
self._table = [None] * new_capacity
67+
self._len = 0
68+
for k, v in old_items:
69+
self[k] = v
70+
71+
def __setitem__(self, key: KEY, val: VAL) -> None:
72+
if self._is_full():
73+
self._resize(len(self._table) * 2)
74+
75+
home = self._home(key)
76+
node = self._table[home]
77+
78+
if node is None:
79+
self._table[home] = _Node(key, val, -1)
80+
self._len += 1
81+
return
82+
83+
# Search chain for update.
84+
cur = home
85+
while True:
86+
n = self._table[cur]
87+
if n is None:
88+
break
89+
# FIX: Ensure n is not None before access
90+
assert n is not None
91+
92+
if n.key == key:
93+
n.val = val
94+
return
95+
if n.next == -1:
96+
break
97+
cur = n.next
98+
99+
# Insert new node at a free slot and link it.
100+
free = self._find_free_from_end()
101+
if free == -1:
102+
self._resize(len(self._table) * 2)
103+
self[key] = val
104+
return
105+
106+
self._table[free] = _Node(key, val, -1)
107+
108+
# FIX: Ensure we are linking from a valid node
109+
if (tail_node := self._table[cur]) is not None:
110+
tail_node.next = free
111+
112+
self._len += 1
113+
114+
def __getitem__(self, key: KEY) -> VAL:
115+
home = self._home(key)
116+
cur = home
117+
while cur != -1:
118+
node = self._table[cur]
119+
if node is None:
120+
break
121+
assert node is not None
122+
if node.key == key:
123+
return node.val
124+
cur = node.next
125+
raise KeyError(key)
126+
127+
def __delitem__(self, key: KEY) -> None:
128+
home = self._home(key)
129+
prev = -1
130+
cur = home
131+
132+
while cur != -1:
133+
node = self._table[cur]
134+
if node is None:
135+
break
136+
assert node is not None
137+
if node.key == key:
138+
# If deleting head: copy next node into home if exists
139+
# (keeps chains valid).
140+
if prev == -1:
141+
if node.next == -1:
142+
self._table[cur] = None
143+
else:
144+
nxt = node.next
145+
# Safely copy next node data
146+
nxt_node = self._table[nxt]
147+
# Mypy needs to know nxt_node exists if we are copying from it
148+
if nxt_node is not None:
149+
self._table[cur] = _Node(
150+
nxt_node.key,
151+
nxt_node.val,
152+
nxt_node.next,
153+
)
154+
self._table[nxt] = None
155+
else:
156+
prev_node = self._table[prev]
157+
if prev_node is not None:
158+
prev_node.next = node.next
159+
self._table[cur] = None
160+
self._len -= 1
161+
return
162+
prev, cur = cur, node.next
163+
164+
raise KeyError(key)
165+
166+
def __iter__(self) -> Iterator[KEY]:
167+
for node in self._table:
168+
if node is not None:
169+
yield node.key
170+
171+
def __len__(self) -> int:
172+
return self._len
173+
174+
175+
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python3
2+
"""
3+
FNV-1a hashing + a small educational hash map.
4+
5+
FNV-1a is a fast, non-cryptographic hash often used for hashing bytes/strings.
6+
Reference: https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from collections.abc import Iterator, MutableMapping
12+
from dataclasses import dataclass
13+
from typing import TypeVar
14+
15+
KEY = TypeVar("KEY")
16+
VAL = TypeVar("VAL")
17+
18+
19+
def fnv1a_32(data: bytes) -> int:
20+
"""
21+
Compute 32-bit FNV-1a over bytes.
22+
23+
>>> fnv1a_32(b"")
24+
2166136261
25+
>>> fnv1a_32(b"a") # deterministic
26+
3826002220
27+
"""
28+
h = 0x811C9DC5 # offset basis
29+
for b in data:
30+
h ^= b
31+
h = (h * 0x01000193) & 0xFFFFFFFF
32+
return h
33+
34+
35+
def fnv1a_64(data: bytes) -> int:
36+
"""
37+
Compute 64-bit FNV-1a over bytes.
38+
39+
>>> fnv1a_64(b"")
40+
14695981039346656037
41+
"""
42+
h = 0xCBF29CE484222325 # offset basis
43+
for b in data:
44+
h ^= b
45+
h = (h * 0x100000001B3) & 0xFFFFFFFFFFFFFFFF
46+
return h
47+
48+
49+
@dataclass(slots=True)
50+
class _Item[KEY, VAL]:
51+
key: KEY
52+
val: VAL
53+
54+
55+
class _DeletedItem(_Item):
56+
def __init__(self) -> None:
57+
super().__init__(None, None)
58+
59+
def __bool__(self) -> bool:
60+
return False
61+
62+
63+
_deleted = _DeletedItem()
64+
65+
66+
class FNVHashMap(MutableMapping[KEY, VAL]):
67+
"""
68+
Hash map using FNV-1a for string/bytes keys and Python's hash otherwise.
69+
70+
>>> hm = FNVHashMap()
71+
>>> hm["hello"] = 1
72+
>>> hm[b"hello"] = 2
73+
>>> hm["hello"]
74+
1
75+
>>> hm[b"hello"]
76+
2
77+
>>> "missing" in hm
78+
False
79+
"""
80+
81+
def __init__(
82+
self, initial_block_size: int = 8, capacity_factor: float = 0.75
83+
) -> None:
84+
if initial_block_size < 1:
85+
raise ValueError("initial_block_size must be >= 1")
86+
if not (0.0 < capacity_factor < 1.0):
87+
raise ValueError("capacity_factor must be between 0 and 1")
88+
89+
self._initial_block_size = initial_block_size
90+
self._buckets: list[_Item | None] = [None] * initial_block_size
91+
self._capacity_factor = capacity_factor
92+
self._len = 0
93+
94+
def _hash_key(self, key: KEY) -> int:
95+
if isinstance(key, bytes):
96+
return fnv1a_32(key)
97+
if isinstance(key, str):
98+
return fnv1a_32(key.encode("utf-8"))
99+
return hash(key)
100+
101+
def _get_bucket_index(self, key: KEY) -> int:
102+
return self._hash_key(key) % len(self._buckets)
103+
104+
def _iterate_buckets(self, key: KEY) -> Iterator[int]:
105+
ind = self._get_bucket_index(key)
106+
for _ in range(len(self._buckets)):
107+
yield ind
108+
ind = (ind + 1) % len(self._buckets)
109+
110+
def _is_full(self) -> bool:
111+
return self._len >= int(len(self._buckets) * self._capacity_factor)
112+
113+
def _resize(self, new_size: int) -> None:
114+
old = self._buckets
115+
self._buckets = [None] * new_size
116+
self._len = 0
117+
for item in old:
118+
if item:
119+
self[item.key] = item.val
120+
121+
def __setitem__(self, key: KEY, val: VAL) -> None:
122+
if self._is_full():
123+
self._resize(len(self._buckets) * 2)
124+
125+
for ind in self._iterate_buckets(key):
126+
stored = self._buckets[ind]
127+
if not stored:
128+
self._buckets[ind] = _Item(key, val)
129+
self._len += 1
130+
return
131+
if stored.key == key:
132+
stored.val = val
133+
return
134+
135+
# Extremely unlikely due to resize policy, but safe.
136+
self._resize(len(self._buckets) * 2)
137+
self[key] = val
138+
139+
def __getitem__(self, key: KEY) -> VAL:
140+
for ind in self._iterate_buckets(key):
141+
item = self._buckets[ind]
142+
if item is None:
143+
break
144+
if item is _deleted:
145+
continue
146+
if item.key == key:
147+
return item.val
148+
raise KeyError(key)
149+
150+
def __delitem__(self, key: KEY) -> None:
151+
for ind in self._iterate_buckets(key):
152+
item = self._buckets[ind]
153+
if item is None:
154+
break
155+
if item is _deleted:
156+
continue
157+
if item.key == key:
158+
self._buckets[ind] = _deleted
159+
self._len -= 1
160+
return
161+
raise KeyError(key)
162+
163+
def __iter__(self) -> Iterator[KEY]:
164+
yield from (item.key for item in self._buckets if item)
165+
166+
def __len__(self) -> int:
167+
return self._len
168+
169+
def __repr__(self) -> str:
170+
parts = ", ".join(f"{k!r}: {v!r}" for k, v in self.items())
171+
return f"FNVHashMap({parts})"
172+
173+
174+
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)