
Commit 86f5936

refactor: split builtin_codecs.py into package
Split the 1,286-line builtin_codecs.py into a builtin_codecs/ package with separate modules for each codec:

- blob.py: BlobCodec (Python object serialization)
- hash.py: HashCodec (hash-addressed storage)
- schema.py: SchemaCodec (base class for schema-addressed codecs)
- object.py: ObjectCodec (files/folders)
- attach.py: AttachCodec (file attachments)
- filepath.py: FilepathCodec (references to existing files)
- npy.py: NpyCodec + NpyRef (numpy arrays)

Benefits:

- Each codec is now self-contained and easier to understand
- Easier to maintain and test individual codecs
- Clearer organization for contributors
- No change to public API (same imports work)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 2e24b9e commit 86f5936

File tree

9 files changed: +1329 −1286 lines

src/datajoint/builtin_codecs.py

Lines changed: 0 additions & 1286 deletions
This file was deleted.
src/datajoint/builtin_codecs/__init__.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
"""
Built-in DataJoint codecs.

This package defines the standard codecs that ship with DataJoint.
These serve as both useful built-in codecs and as examples for users who
want to create their own custom codecs.

Built-in Codecs:
- ``<blob>``: Serialize Python objects (in-table storage)
- ``<blob@>``: Serialize Python objects (external with hash-addressed dedup)
- ``<attach>``: File attachment (in-table storage)
- ``<attach@>``: File attachment (external with hash-addressed dedup)
- ``<hash@>``: Hash-addressed storage with MD5 deduplication (external only)
- ``<object@>``: Schema-addressed storage for files/folders (external only)
- ``<npy@>``: Store numpy arrays as portable .npy files (external only)
- ``<filepath@store>``: Reference to existing file in store (external only)

Example - Creating a Custom Codec:
    Here's how to define your own codec, modeled after the built-in codecs::

        import datajoint as dj
        import networkx as nx

        class GraphCodec(dj.Codec):
            '''Store NetworkX graphs as edge lists.'''

            name = "graph"  # Use as <graph> in definitions

            def get_dtype(self, is_store: bool) -> str:
                return "<blob>"  # Compose with blob for serialization

            def encode(self, graph, *, key=None, store_name=None):
                # Convert graph to a serializable format
                return {
                    'nodes': list(graph.nodes(data=True)),
                    'edges': list(graph.edges(data=True)),
                }

            def decode(self, stored, *, key=None):
                # Reconstruct graph from stored format
                G = nx.Graph()
                G.add_nodes_from(stored['nodes'])
                G.add_edges_from(stored['edges'])
                return G

            def validate(self, value):
                if not isinstance(value, nx.Graph):
                    raise TypeError(f"Expected nx.Graph, got {type(value).__name__}")

        # Now use in table definitions:
        @schema
        class Networks(dj.Manual):
            definition = '''
            network_id : int
            ---
            topology : <graph>
            '''
"""

from .attach import AttachCodec
from .blob import BlobCodec
from .filepath import FilepathCodec
from .hash import HashCodec
from .npy import NpyCodec, NpyRef
from .object import ObjectCodec
from .schema import SchemaCodec

__all__ = [
    "BlobCodec",
    "HashCodec",
    "SchemaCodec",
    "ObjectCodec",
    "AttachCodec",
    "FilepathCodec",
    "NpyCodec",
    "NpyRef",
]
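
Since the package __init__ above re-exports every codec class, the commit message's claim of an unchanged public API can be sanity-checked with the old import path; a minimal sketch (hypothetical snippet, not part of the diff; assumes datajoint installed from this commit):

    # The same names that were importable from the old flat module
    # resolve through the new package __init__.
    from datajoint.builtin_codecs import AttachCodec, BlobCodec

    print(BlobCodec.name)    # "blob"   -> used as <blob> in definitions
    print(AttachCodec.name)  # "attach" -> used as <attach> in definitions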
src/datajoint/builtin_codecs/attach.py

Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
"""
File attachment codec with filename preservation.
"""

from __future__ import annotations

from typing import Any

from ..codecs import Codec


class AttachCodec(Codec):
    """
    File attachment with filename preserved.

    Supports both in-table and in-store storage:
    - ``<attach>``: Stored in database (bytes → LONGBLOB)
    - ``<attach@>``: Stored in object store via ``<hash@>`` with deduplication
    - ``<attach@store>``: Stored in specific named store

    The filename is preserved and the file is extracted to the configured
    download path on fetch.

    Example::

        @schema
        class Documents(dj.Manual):
            definition = '''
            doc_id : int
            ---
            config : <attach>        # in-table (small file in DB)
            dataset : <attach@>      # in-store (default store)
            archive : <attach@cold>  # in-store (specific store)
            '''

        # Insert a file
        table.insert1({'doc_id': 1, 'config': '/path/to/config.json'})

        # Fetch extracts to download_path and returns local path
        local_path = (table & 'doc_id=1').fetch1('config')

    Storage Format (internal):
        The blob contains: ``filename\\0contents``
        - Filename (UTF-8 encoded) + null byte + raw file contents
    """

    name = "attach"

    def get_dtype(self, is_store: bool) -> str:
        """Return bytes for in-table, <hash> for in-store storage."""
        return "<hash>" if is_store else "bytes"

    def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
        """
        Read file and encode as filename + contents.

        Parameters
        ----------
        value : str or Path
            Path to file.
        key : dict, optional
            Primary key values (unused).
        store_name : str, optional
            Unused for internal storage.

        Returns
        -------
        bytes
            Filename (UTF-8) + null byte + file contents.
        """
        from pathlib import Path

        path = Path(value)
        if not path.exists():
            raise FileNotFoundError(f"Attachment file not found: {path}")
        if path.is_dir():
            raise IsADirectoryError(f"<attach> does not support directories: {path}")

        filename = path.name
        contents = path.read_bytes()
        return filename.encode("utf-8") + b"\x00" + contents

    def decode(self, stored: bytes, *, key: dict | None = None) -> str:
        """
        Extract file to download path and return local path.

        Parameters
        ----------
        stored : bytes
            Blob containing filename + null + contents.
        key : dict, optional
            Primary key values (unused).

        Returns
        -------
        str
            Path to extracted file.
        """
        from pathlib import Path

        from ..settings import config

        # Split on first null byte
        null_pos = stored.index(b"\x00")
        filename = stored[:null_pos].decode("utf-8")
        contents = stored[null_pos + 1 :]

        # Write to download path
        download_path = Path(config.get("download_path", "."))
        download_path.mkdir(parents=True, exist_ok=True)
        local_path = download_path / filename

        # Handle filename collision - if file exists with different content, add suffix
        if local_path.exists():
            existing_contents = local_path.read_bytes()
            if existing_contents != contents:
                # Find unique filename
                stem = local_path.stem
                suffix = local_path.suffix
                counter = 1
                while local_path.exists() and local_path.read_bytes() != contents:
                    local_path = download_path / f"{stem}_{counter}{suffix}"
                    counter += 1

        # Only write if file doesn't exist or has different content
        if not local_path.exists():
            local_path.write_bytes(contents)

        return str(local_path)

    def validate(self, value: Any) -> None:
        """Validate that value is a valid file path."""
        from pathlib import Path

        if not isinstance(value, (str, Path)):
            raise TypeError(f"<attach> expects a file path, got {type(value).__name__}")
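The ``filename\0contents`` framing used by encode/decode above is plain stdlib byte handling; a minimal round-trip sketch of just that framing (helper names are hypothetical, not part of the diff):

    from pathlib import Path

    def pack_attachment(path: str) -> bytes:
        # Mirrors AttachCodec.encode: UTF-8 filename, null byte, raw contents.
        p = Path(path)
        return p.name.encode("utf-8") + b"\x00" + p.read_bytes()

    def unpack_attachment(blob: bytes) -> tuple[str, bytes]:
        # Mirrors AttachCodec.decode's split on the first null byte.
        sep = blob.index(b"\x00")
        return blob[:sep].decode("utf-8"), blob[sep + 1:]

    # Round trip:
    # Path("config.json").write_text('{"a": 1}')
    # name, data = unpack_attachment(pack_attachment("config.json"))
    # assert name == "config.json" and data == b'{"a": 1}'

Splitting on the first null byte is unambiguous because null bytes cannot appear in filenames on common filesystems.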
src/datajoint/builtin_codecs/blob.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
"""
Blob codec for Python object serialization.
"""

from __future__ import annotations

from typing import Any

from ..codecs import Codec


class BlobCodec(Codec):
    """
    Serialize Python objects using DataJoint's blob format.

    The ``<blob>`` codec handles serialization of arbitrary Python objects
    including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs.

    Supports both in-table and in-store storage:
    - ``<blob>``: Stored in database (bytes → LONGBLOB)
    - ``<blob@>``: Stored in object store via ``<hash@>`` with deduplication
    - ``<blob@store>``: Stored in specific named store

    Format Features:
    - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native)
    - Optional zlib compression for data > 1KB
    - Support for nested structures

    Example::

        @schema
        class ProcessedData(dj.Manual):
            definition = '''
            data_id : int
            ---
            small_result : <blob>   # in-table (in database)
            large_result : <blob@>  # in-store (default store)
            archive : <blob@cold>   # in-store (specific store)
            '''

        # Insert any serializable object
        table.insert1({'data_id': 1, 'small_result': {'scores': [0.9, 0.8]}})
    """

    name = "blob"

    def get_dtype(self, is_store: bool) -> str:
        """Return bytes for in-table, <hash> for in-store storage."""
        return "<hash>" if is_store else "bytes"

    def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
        """Serialize a Python object to DataJoint's blob format."""
        from .. import blob

        return blob.pack(value, compress=True)

    def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
        """Deserialize blob bytes back to a Python object."""
        from .. import blob

        return blob.unpack(stored, squeeze=False)
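
BlobCodec delegates to the existing blob module with exactly the calls shown above; a round-trip sketch using those same calls (assumes a datajoint installation; the payload is illustrative):

    from datajoint import blob

    payload = {'scores': [0.9, 0.8], 'label': 'run-1'}

    packed = blob.pack(payload, compress=True)     # bytes with protocol header,
                                                   # zlib-compressed if > 1KB
    restored = blob.unpack(packed, squeeze=False)  # back to a Python object

    print(type(packed), len(packed))
    print(restored)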
