Skip to content

Commit 984a9be

Browse files
Merge pull request #1334 from datajoint/feature/unified-stores-config
Clarify dual-mode codecs in builtin_codecs docstring
2 parents bf62620 + 75b6f29 commit 984a9be

File tree

9 files changed: 43 additions and 35 deletions.

src/datajoint/builtin_codecs.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
want to create their own custom codecs.
77
88
Built-in Codecs:
9-
- ``<blob>``: Serialize Python objects (internal) or external with dedup
10-
- ``<hash>``: Hash-addressed storage with SHA256 deduplication
11-
- ``<object>``: Schema-addressed storage for files/folders (Zarr, HDF5)
12-
- ``<attach>``: File attachment (internal) or external with dedup
13-
- ``<filepath@store>``: Reference to existing file in store
9+
- ``<blob>``: Serialize Python objects (in-table storage)
10+
- ``<blob@>``: Serialize Python objects (external with hash-addressed dedup)
11+
- ``<attach>``: File attachment (in-table storage)
12+
- ``<attach@>``: File attachment (external with hash-addressed dedup)
13+
- ``<hash@>``: Hash-addressed storage with MD5 deduplication (external only)
14+
- ``<object@>``: Schema-addressed storage for files/folders (external only)
1415
- ``<npy@>``: Store numpy arrays as portable .npy files (external only)
16+
- ``<filepath@store>``: Reference to existing file in store (external only)
1517
1618
Example - Creating a Custom Codec:
1719
Here's how to define your own codec, modeled after the built-in codecs::
@@ -75,9 +77,9 @@ class BlobCodec(Codec):
7577
The ``<blob>`` codec handles serialization of arbitrary Python objects
7678
including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs.
7779
78-
Supports both internal and external storage:
80+
Supports both in-table and in-store storage:
7981
- ``<blob>``: Stored in database (bytes → LONGBLOB)
80-
- ``<blob@>``: Stored externally via ``<hash@>`` with deduplication
82+
- ``<blob@>``: Stored in object store via ``<hash@>`` with deduplication
8183
- ``<blob@store>``: Stored in specific named store
8284
8385
Format Features:
@@ -92,9 +94,9 @@ class ProcessedData(dj.Manual):
9294
definition = '''
9395
data_id : int
9496
---
95-
small_result : <blob> # internal (in database)
96-
large_result : <blob@> # external (default store)
97-
archive : <blob@cold> # external (specific store)
97+
small_result : <blob> # in-table (in database)
98+
large_result : <blob@> # in-store (default store)
99+
archive : <blob@cold> # in-store (specific store)
98100
'''
99101
100102
# Insert any serializable object
@@ -104,7 +106,7 @@ class ProcessedData(dj.Manual):
104106
name = "blob"
105107

106108
def get_dtype(self, is_store: bool) -> str:
107-
"""Return bytes for internal, <hash> for external storage."""
109+
"""Return bytes for in-table, <hash> for in-store storage."""
108110
return "<hash>" if is_store else "bytes"
109111

110112
def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
@@ -165,9 +167,9 @@ class RawContent(dj.Manual):
165167
name = "hash"
166168

167169
def get_dtype(self, is_store: bool) -> str:
168-
"""Hash storage is external only."""
170+
"""Hash storage is in-store only."""
169171
if not is_store:
170-
raise DataJointError("<hash> requires @ (external storage only)")
172+
raise DataJointError("<hash> requires @ (in-store storage only)")
171173
return "json"
172174

173175
def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict:
@@ -608,9 +610,9 @@ class AttachCodec(Codec):
608610
"""
609611
File attachment with filename preserved.
610612
611-
Supports both internal and external storage:
613+
Supports both in-table and in-store storage:
612614
- ``<attach>``: Stored in database (bytes → LONGBLOB)
613-
- ``<attach@>``: Stored externally via ``<hash@>`` with deduplication
615+
- ``<attach@>``: Stored in object store via ``<hash@>`` with deduplication
614616
- ``<attach@store>``: Stored in specific named store
615617
616618
The filename is preserved and the file is extracted to the configured
@@ -623,9 +625,9 @@ class Documents(dj.Manual):
623625
definition = '''
624626
doc_id : int
625627
---
626-
config : <attach> # internal (small file in DB)
627-
dataset : <attach@> # external (default store)
628-
archive : <attach@cold> # external (specific store)
628+
config : <attach> # in-table (small file in DB)
629+
dataset : <attach@> # in-store (default store)
630+
archive : <attach@cold> # in-store (specific store)
629631
'''
630632
631633
# Insert a file
@@ -642,7 +644,7 @@ class Documents(dj.Manual):
642644
name = "attach"
643645

644646
def get_dtype(self, is_store: bool) -> str:
645-
"""Return bytes for internal, <hash> for external storage."""
647+
"""Return bytes for in-table, <hash> for in-store storage."""
646648
return "<hash>" if is_store else "bytes"
647649

648650
def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:

src/datajoint/codecs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
154154
key : dict, optional
155155
Primary key values. May be needed for path construction.
156156
store_name : str, optional
157-
Target store name for external storage.
157+
Target store name for object storage.
158158
159159
Returns
160160
-------
@@ -514,7 +514,7 @@ def decode_attribute(attr, data, squeeze: bool = False):
514514
This is the central decode function used by all fetch methods. It handles:
515515
- Codec chains (e.g., <blob@store> → <hash> → bytes)
516516
- Native type conversions (JSON, UUID)
517-
- External storage downloads (via config["download_path"])
517+
- Object storage downloads (via config["download_path"])
518518
519519
Args:
520520
attr: Attribute from the table's heading.
@@ -533,7 +533,7 @@ def decode_attribute(attr, data, squeeze: bool = False):
533533
return None
534534

535535
if attr.codec:
536-
# Get store if present for external storage
536+
# Get store if present for object storage
537537
store = getattr(attr, "store", None)
538538
if store is not None:
539539
dtype_spec = f"<{attr.codec.name}@{store}>"

src/datajoint/condition.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def make_condition(
293293
- ``str``: Used directly as SQL condition
294294
- ``dict``: AND of equality conditions for matching attributes
295295
- ``bool``: Returns the boolean value (possibly negated)
296-
- ``QueryExpression``: Generates subquery (semijoin/antijoin)
296+
- ``QueryExpression``: Generates subquery for restriction
297297
- ``AndList``: AND of all conditions
298298
- ``list/set/tuple``: OR of all conditions
299299
- ``numpy.void``: Like dict, from record array
@@ -398,7 +398,7 @@ def combine_conditions(negate, conditions):
398398
if inspect.isclass(condition) and issubclass(condition, QueryExpression):
399399
condition = condition()
400400

401-
# restrict by another expression (aka semijoin and antijoin)
401+
# restrict by another expression
402402
if isinstance(condition, QueryExpression):
403403
assert_join_compatibility(query_expression, condition, semantic_check=semantic_check)
404404
# Match on all non-hidden namesakes (hidden attributes excluded)

src/datajoint/declare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,7 +650,7 @@ def substitute_special_type(match: dict, category: str, foreign_key_sql: list[st
650650
codec, store_name = lookup_codec(match["type"])
651651
if store_name is not None:
652652
match["store"] = store_name
653-
# Determine if external storage is used (store_name is present, even if empty string for default)
653+
# Determine if in-store storage is used (store_name is present, even if empty string for default)
654654
is_store = store_name is not None
655655
inner_dtype = codec.get_dtype(is_store=is_store)
656656

src/datajoint/expression.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -686,7 +686,7 @@ def to_dicts(self, order_by=None, limit=None, offset=None, squeeze=False):
686686
:param squeeze: if True, remove extra dimensions from arrays
687687
:return: list of dictionaries, one per row
688688
689-
For external storage types (attachments, filepaths), files are downloaded
689+
For object storage types (attachments, filepaths), files are downloaded
690690
to config["download_path"]. Use config.override() to change::
691691
692692
with dj.config.override(download_path="/data"):
@@ -1078,7 +1078,7 @@ def make_sql(self):
10781078
alias=next(self.__count),
10791079
sorting=self.sorting_clauses(),
10801080
)
1081-
# with secondary attributes, use union of left join with antijoin
1081+
# with secondary attributes, use union of left join with anti-restriction
10821082
fields = self.heading.names
10831083
sql1 = arg1.join(arg2, left=True).make_sql(fields)
10841084
sql2 = (arg2 - arg1).proj(..., **{k: "NULL" for k in arg1.heading.secondary_attributes}).make_sql(fields)

src/datajoint/gc.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""
2-
Garbage collection for external storage.
2+
Garbage collection for object storage.
33
44
This module provides utilities to identify and remove orphaned items
5-
from external storage. Storage items become orphaned when all database rows
5+
from object storage. Storage items become orphaned when all database rows
66
referencing them are deleted.
77
8-
DataJoint uses two external storage patterns:
8+
DataJoint uses two object storage patterns:
99
1010
Hash-addressed storage
1111
Types: ``<hash@>``, ``<blob@>``, ``<attach@>``
@@ -31,7 +31,7 @@
3131
3232
See Also
3333
--------
34-
datajoint.builtin_codecs : Codec implementations for external storage types.
34+
datajoint.builtin_codecs : Codec implementations for object storage types.
3535
"""
3636

3737
from __future__ import annotations
@@ -638,7 +638,7 @@ def format_stats(stats: dict[str, Any]) -> str:
638638
str
639639
Formatted string.
640640
"""
641-
lines = ["External Storage Statistics:"]
641+
lines = ["Object Storage Statistics:"]
642642

643643
# Show hash-addressed storage stats if present
644644
if "hash_referenced" in stats:

src/datajoint/heading.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ def _init_from_database(self) -> None:
449449
# if no codec, then delay the error until the first invocation
450450
attr["codec"] = _MissingType(codec_spec)
451451
else:
452-
# Determine if external storage based on store presence
452+
# Determine if in-store storage based on store presence
453453
is_store = attr.get("store") is not None
454454
attr["type"] = attr["codec"].get_dtype(is_store=is_store)
455455
if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()):

src/datajoint/migrate.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
This module provides tools for migrating existing schemas to use the new
55
Codec system, particularly for upgrading blob columns to use
66
explicit `<blob>` type declarations.
7+
8+
Note on Terminology
9+
-------------------
10+
This module uses "external storage" because that was the term in DataJoint 0.14.6.
11+
In DataJoint 2.0 documentation, this is called "object storage" (general term)
12+
or "in-store storage" (specific to the @ modifier).
713
"""
814

915
from __future__ import annotations

src/datajoint/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ class DisplaySettings(BaseSettings):
208208

209209
class StoresSettings(BaseSettings):
210210
"""
211-
Unified external storage configuration.
211+
Unified object storage configuration.
212212
213213
Stores configuration supports both hash-addressed and schema-addressed storage
214214
using the same named stores with _hash and _schema sections.
@@ -296,7 +296,7 @@ class Config(BaseSettings):
296296
# Unified stores configuration (replaces external and object_storage)
297297
stores: dict[str, Any] = Field(
298298
default_factory=dict,
299-
description="Unified external storage configuration. "
299+
description="Unified object storage configuration. "
300300
"Use stores.default to designate default store. "
301301
"Configure named stores as stores.<name>.protocol, stores.<name>.location, etc.",
302302
)

0 commit comments

Comments
 (0)