
Commit bc245d3

fix: PostgreSQL compatibility improvements for DataJoint 2.1
Multiple fixes for PostgreSQL backend compatibility:

1. Fix composite FK column mapping in dependencies.py
   - Use pg_constraint with unnest() to correctly map FK columns
   - The previous information_schema query created a Cartesian product
   - Fixes "Attribute already exists" errors during key_source

2. Fix Part table full_table_name quoting
   - PartMeta.full_table_name now uses adapter.quote_identifier()
   - Previously hardcoded MySQL backticks
   - Fixes "syntax error at or near `" errors with Part tables

3. Fix char type length preservation in postgres.py
   - Reconstruct parametrized types from the PostgreSQL info schema
   - Fixes char(n) being truncated to char(1) for FK columns

4. Implement HAVING clause subquery wrapping for PostgreSQL
   - PostgreSQL doesn't allow column aliases in HAVING
   - Aggregation.make_sql() wraps the query as a subquery with WHERE on PostgreSQL
   - MySQL continues to use HAVING directly (more efficient)

5. Implement GROUP_CONCAT/STRING_AGG translation
   - The base adapter gains a translate_expression() method
   - PostgreSQL: GROUP_CONCAT → STRING_AGG
   - MySQL: STRING_AGG → GROUP_CONCAT
   - heading.py calls translate_expression() in as_sql()

6. Register numpy type adapters for PostgreSQL
   - numpy.bool_, int*, float* types now work with psycopg2
   - Prevents "can't adapt type 'numpy.bool_'" errors

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent d2e89ba commit bc245d3

File tree

7 files changed: +242 -44 lines changed

src/datajoint/adapters/base.py
src/datajoint/adapters/mysql.py
src/datajoint/adapters/postgres.py
src/datajoint/dependencies.py
src/datajoint/expression.py
src/datajoint/heading.py
src/datajoint/user_tables.py


src/datajoint/adapters/base.py

Lines changed: 28 additions & 0 deletions
@@ -938,6 +938,34 @@ def json_path_expr(self, column: str, path: str, return_type: str | None = None)
         """
         ...
 
+    def translate_expression(self, expr: str) -> str:
+        """
+        Translate SQL expression for backend compatibility.
+
+        Converts database-specific function calls to the equivalent syntax
+        for the current backend. This enables portable DataJoint code that
+        uses common aggregate functions.
+
+        Translations performed:
+        - GROUP_CONCAT(col) ↔ STRING_AGG(col, ',')
+
+        Parameters
+        ----------
+        expr : str
+            SQL expression that may contain function calls.
+
+        Returns
+        -------
+        str
+            Translated expression for the current backend.
+
+        Notes
+        -----
+        The base implementation returns the expression unchanged.
+        Subclasses override to provide backend-specific translations.
+        """
+        return expr
+
     # =========================================================================
     # DDL Generation
     # =========================================================================
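The base method is an identity transform, so backends that need no rewriting simply inherit a no-op. A minimal sketch of the override contract (the NoOpAdapter class here is hypothetical, for illustration only):

class NoOpAdapter:
    def translate_expression(self, expr: str) -> str:
        return expr  # base behavior: pass the expression through unchanged

assert NoOpAdapter().translate_expression("GROUP_CONCAT(x)") == "GROUP_CONCAT(x)"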

src/datajoint/adapters/mysql.py

Lines changed: 44 additions & 0 deletions
@@ -827,6 +827,50 @@ def json_path_expr(self, column: str, path: str, return_type: str | None = None)
         return_clause = f" returning {return_type}" if return_type else ""
         return f"json_value({quoted_col}, _utf8mb4'$.{path}'{return_clause})"
 
+    def translate_expression(self, expr: str) -> str:
+        """
+        Translate SQL expression for MySQL compatibility.
+
+        Converts PostgreSQL-specific functions to MySQL equivalents:
+        - STRING_AGG(col, 'sep') → GROUP_CONCAT(col SEPARATOR 'sep')
+        - STRING_AGG(col, ',') → GROUP_CONCAT(col)
+
+        Parameters
+        ----------
+        expr : str
+            SQL expression that may contain function calls.
+
+        Returns
+        -------
+        str
+            Translated expression for MySQL.
+        """
+        import re
+
+        # STRING_AGG(col, 'sep') → GROUP_CONCAT(col SEPARATOR 'sep')
+        def replace_string_agg(match):
+            inner = match.group(1).strip()
+            # Parse arguments: col, 'separator'
+            # Handle both single and double quoted separators
+            arg_match = re.match(r"(.+?)\s*,\s*(['\"])(.+?)\2", inner)
+            if arg_match:
+                col = arg_match.group(1).strip()
+                sep = arg_match.group(3)
+                # Remove ::text cast if present (PostgreSQL-specific)
+                col = re.sub(r"::text$", "", col)
+                if sep == ",":
+                    return f"GROUP_CONCAT({col})"
+                else:
+                    return f"GROUP_CONCAT({col} SEPARATOR '{sep}')"
+            else:
+                # No separator found, just use the expression
+                col = re.sub(r"::text$", "", inner)
+                return f"GROUP_CONCAT({col})"
+
+        expr = re.sub(r"STRING_AGG\s*\((.+?)\)", replace_string_agg, expr, flags=re.IGNORECASE)
+
+        return expr
+
     # =========================================================================
     # DDL Generation
     # =========================================================================
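A standalone sketch of the rewrite above, runnable without an adapter instance (the function name translate_for_mysql is hypothetical; the regex logic mirrors the diff):

import re

def translate_for_mysql(expr: str) -> str:
    # Mirror of the adapter logic: STRING_AGG(col, 'sep') → GROUP_CONCAT(...)
    def replace_string_agg(match):
        inner = match.group(1).strip()
        arg_match = re.match(r"(.+?)\s*,\s*(['\"])(.+?)\2", inner)
        if arg_match:
            col = re.sub(r"::text$", "", arg_match.group(1).strip())
            sep = arg_match.group(3)
            if sep == ",":
                return f"GROUP_CONCAT({col})"
            return f"GROUP_CONCAT({col} SEPARATOR '{sep}')"
        col = re.sub(r"::text$", "", inner)
        return f"GROUP_CONCAT({col})"
    return re.sub(r"STRING_AGG\s*\((.+?)\)", replace_string_agg, expr, flags=re.IGNORECASE)

assert translate_for_mysql("STRING_AGG(name::text, ',')") == "GROUP_CONCAT(name)"
assert translate_for_mysql("STRING_AGG(note, '; ')") == "GROUP_CONCAT(note SEPARATOR '; ')"

Note that the non-greedy (.+?)\) stops at the first closing parenthesis, so arguments containing nested parentheses (e.g. a coalesce() call) would be mis-split; the sketch shares that limitation with the adapter code.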

src/datajoint/adapters/postgres.py

Lines changed: 86 additions & 0 deletions
@@ -130,8 +130,38 @@ def connect(
         # DataJoint manages transactions explicitly via start_transaction()
         # Set autocommit=True to avoid implicit transactions
         conn.autocommit = True
+
+        # Register numpy type adapters so numpy types can be used directly in queries
+        self._register_numpy_adapters()
+
         return conn
 
+    def _register_numpy_adapters(self) -> None:
+        """
+        Register psycopg2 adapters for numpy types.
+
+        This allows numpy scalar types (bool_, int64, float64, etc.) to be used
+        directly in queries without explicit conversion to Python native types.
+        """
+        try:
+            import numpy as np
+            from psycopg2.extensions import register_adapter, AsIs
+
+            # Numpy bool type
+            register_adapter(np.bool_, lambda x: AsIs(str(bool(x)).upper()))
+
+            # Numpy integer types
+            for np_type in (np.int8, np.int16, np.int32, np.int64,
+                            np.uint8, np.uint16, np.uint32, np.uint64):
+                register_adapter(np_type, lambda x: AsIs(int(x)))
+
+            # Numpy float types
+            for np_type in (np.float16, np.float32, np.float64):
+                register_adapter(np_type, lambda x: AsIs(repr(float(x))))
+
+        except ImportError:
+            pass  # numpy not available
+
     def close(self, connection: Any) -> None:
         """Close the PostgreSQL connection."""
         connection.close()
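A quick way to see the effect of the registration, assuming psycopg2 and numpy are installed (standalone sketch, not part of the diff):

import numpy as np
from psycopg2.extensions import adapt, register_adapter, AsIs

register_adapter(np.bool_, lambda x: AsIs(str(bool(x)).upper()))

# Without the adapter, adapt() raises "can't adapt type 'numpy.bool_'";
# with it, the value renders as a SQL literal:
print(adapt(np.bool_(True)).getquoted())  # b'TRUE'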
@@ -853,6 +883,25 @@ def parse_column_info(self, row: dict[str, Any]) -> dict[str, Any]:
         data_type = row["data_type"]
         if data_type == "USER-DEFINED":
             data_type = row["udt_name"]
+
+        # Reconstruct parametrized types that PostgreSQL splits into separate fields
+        char_max_len = row.get("character_maximum_length")
+        num_precision = row.get("numeric_precision")
+        num_scale = row.get("numeric_scale")
+
+        if data_type == "character" and char_max_len is not None:
+            # char(n) - PostgreSQL reports as "character" with length in separate field
+            data_type = f"char({char_max_len})"
+        elif data_type == "character varying" and char_max_len is not None:
+            # varchar(n)
+            data_type = f"varchar({char_max_len})"
+        elif data_type == "numeric" and num_precision is not None:
+            # numeric(p,s) - reconstruct decimal type
+            if num_scale is not None and num_scale > 0:
+                data_type = f"decimal({num_precision},{num_scale})"
+            else:
+                data_type = f"decimal({num_precision})"
+
         return {
             "name": row["column_name"],
             "type": data_type,
@@ -959,6 +1008,43 @@ def json_path_expr(self, column: str, path: str, return_type: str | None = None)
         # Note: PostgreSQL jsonb_extract_path_text doesn't use return type parameter
         return f"jsonb_extract_path_text({quoted_col}, {path_args})"
 
+    def translate_expression(self, expr: str) -> str:
+        """
+        Translate SQL expression for PostgreSQL compatibility.
+
+        Converts MySQL-specific functions to PostgreSQL equivalents:
+        - GROUP_CONCAT(col) → STRING_AGG(col::text, ',')
+        - GROUP_CONCAT(col SEPARATOR 'sep') → STRING_AGG(col::text, 'sep')
+
+        Parameters
+        ----------
+        expr : str
+            SQL expression that may contain function calls.
+
+        Returns
+        -------
+        str
+            Translated expression for PostgreSQL.
+        """
+        import re
+
+        # GROUP_CONCAT(col) → STRING_AGG(col::text, ',')
+        # GROUP_CONCAT(col SEPARATOR 'sep') → STRING_AGG(col::text, 'sep')
+        def replace_group_concat(match):
+            inner = match.group(1).strip()
+            # Check for SEPARATOR clause
+            sep_match = re.match(r"(.+?)\s+SEPARATOR\s+(['\"])(.+?)\2", inner, re.IGNORECASE)
+            if sep_match:
+                col = sep_match.group(1).strip()
+                sep = sep_match.group(3)
+                return f"STRING_AGG({col}::text, '{sep}')"
+            else:
+                return f"STRING_AGG({inner}::text, ',')"
+
+        expr = re.sub(r"GROUP_CONCAT\s*\((.+?)\)", replace_group_concat, expr, flags=re.IGNORECASE)
+
+        return expr
+
     # =========================================================================
     # DDL Generation
     # =========================================================================
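The mirror-image sketch for the PostgreSQL direction (translate_for_postgres is a hypothetical standalone name; the ::text cast is added because PostgreSQL's string_agg requires a text argument for non-text columns):

import re

def translate_for_postgres(expr: str) -> str:
    def replace_group_concat(match):
        inner = match.group(1).strip()
        sep_match = re.match(r"(.+?)\s+SEPARATOR\s+(['\"])(.+?)\2", inner, re.IGNORECASE)
        if sep_match:
            col = sep_match.group(1).strip()
            sep = sep_match.group(3)
            return f"STRING_AGG({col}::text, '{sep}')"
        return f"STRING_AGG({inner}::text, ',')"
    return re.sub(r"GROUP_CONCAT\s*\((.+?)\)", replace_group_concat, expr, flags=re.IGNORECASE)

assert translate_for_postgres("GROUP_CONCAT(name)") == "STRING_AGG(name::text, ',')"
assert translate_for_postgres("GROUP_CONCAT(note SEPARATOR '; ')") == "STRING_AGG(note::text, '; ')"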

src/datajoint/dependencies.py

Lines changed: 23 additions & 17 deletions
@@ -221,25 +221,31 @@ def load(self, force: bool = True) -> None:
         for key in keys:
             pks[key[0]].add(key[1])
 
-        # load foreign keys (PostgreSQL requires joining multiple tables)
-        ref_tab_expr = "'\"' || ccu.table_schema || '\".\"' || ccu.table_name || '\"'"
+        # load foreign keys using pg_constraint system catalogs
+        # The information_schema approach creates a Cartesian product for composite FKs
+        # because constraint_column_usage doesn't have ordinal_position.
+        # Using pg_constraint with unnest(conkey, confkey) WITH ORDINALITY gives correct mapping.
         fk_keys = self._conn.query(
             f"""
-            SELECT kcu.constraint_name,
-                   {tab_expr} as referencing_table,
-                   {ref_tab_expr} as referenced_table,
-                   kcu.column_name, ccu.column_name as referenced_column_name
-            FROM information_schema.key_column_usage kcu
-            JOIN information_schema.referential_constraints rc
-                ON kcu.constraint_name = rc.constraint_name
-                AND kcu.constraint_schema = rc.constraint_schema
-            JOIN information_schema.constraint_column_usage ccu
-                ON rc.unique_constraint_name = ccu.constraint_name
-                AND rc.unique_constraint_schema = ccu.constraint_schema
-            WHERE kcu.table_name NOT LIKE {like_pattern}
-                AND (ccu.table_schema in ({schemas_list})
-                     OR kcu.table_schema in ({schemas_list}))
-            ORDER BY kcu.constraint_name, kcu.ordinal_position
+            SELECT
+                c.conname as constraint_name,
+                '"' || ns1.nspname || '"."' || cl1.relname || '"' as referencing_table,
+                '"' || ns2.nspname || '"."' || cl2.relname || '"' as referenced_table,
+                a1.attname as column_name,
+                a2.attname as referenced_column_name
+            FROM pg_constraint c
+            JOIN pg_class cl1 ON c.conrelid = cl1.oid
+            JOIN pg_namespace ns1 ON cl1.relnamespace = ns1.oid
+            JOIN pg_class cl2 ON c.confrelid = cl2.oid
+            JOIN pg_namespace ns2 ON cl2.relnamespace = ns2.oid
+            CROSS JOIN LATERAL unnest(c.conkey, c.confkey) WITH ORDINALITY AS cols(conkey, confkey, ord)
+            JOIN pg_attribute a1 ON a1.attrelid = cl1.oid AND a1.attnum = cols.conkey
+            JOIN pg_attribute a2 ON a2.attrelid = cl2.oid AND a2.attnum = cols.confkey
+            WHERE c.contype = 'f'
+                AND cl1.relname NOT LIKE {like_pattern}
+                AND (ns2.nspname in ({schemas_list})
+                     OR ns1.nspname in ({schemas_list}))
+            ORDER BY c.conname, cols.ord
             """,
             as_dict=True,
        )
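The core of the fix: pg_constraint stores a composite FK as two parallel arrays of column numbers (conkey, confkey) that must be paired positionally, which is exactly what unnest(...) WITH ORDINALITY does. Joining information_schema.constraint_column_usage (which has no ordinal position) instead paired every referencing column with every referenced column. The difference, sketched in Python with hypothetical column numbers:

conkey = [2, 3]    # referencing column numbers, in FK order
confkey = [1, 2]   # referenced column numbers, parallel to conkey

pairs_correct = list(zip(conkey, confkey))                    # unnest WITH ORDINALITY
pairs_cartesian = [(a, b) for a in conkey for b in confkey]   # the old join

assert pairs_correct == [(2, 1), (3, 2)]
assert len(pairs_cartesian) == 4  # duplicate mappings → "Attribute already exists"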

src/datajoint/expression.py

Lines changed: 44 additions & 17 deletions
@@ -1019,27 +1019,54 @@ def where_clause(self):
         return "" if not self._left_restrict else " WHERE (%s)" % ")AND(".join(str(s) for s in self._left_restrict)
 
     def make_sql(self, fields=None):
-        fields = self.heading.as_sql(fields or self.heading.names, adapter=self.connection.adapter)
+        adapter = self.connection.adapter
+        fields = self.heading.as_sql(fields or self.heading.names, adapter=adapter)
         assert self._grouping_attributes or not self.restriction
         distinct = set(self.heading.names) == set(self.primary_key)
-        return "SELECT {distinct}{fields} FROM {from_}{where}{group_by}{sorting}".format(
-            distinct="DISTINCT " if distinct else "",
-            fields=fields,
-            from_=self.from_clause(),
-            where=self.where_clause(),
-            group_by=(
-                ""
-                if not self.primary_key
-                else (
-                    " GROUP BY {}".format(
-                        ", ".join(self.connection.adapter.quote_identifier(col) for col in self._grouping_attributes)
-                    )
-                    + ("" if not self.restriction else " HAVING (%s)" % ")AND(".join(self.restriction))
-                )
-            ),
-            sorting=self.sorting_clauses(),
+
+        # PostgreSQL doesn't allow column aliases in HAVING clause (SQL standard).
+        # For PostgreSQL with restrictions, wrap aggregation in subquery and use WHERE.
+        use_subquery_for_having = (
+            adapter.backend == "postgresql"
+            and self.restriction
+            and self._grouping_attributes
         )
 
+        if use_subquery_for_having:
+            # Generate inner query without HAVING
+            inner_sql = "SELECT {distinct}{fields} FROM {from_}{where}{group_by}".format(
+                distinct="DISTINCT " if distinct else "",
+                fields=fields,
+                from_=self.from_clause(),
+                where=self.where_clause(),
+                group_by=" GROUP BY {}".format(
+                    ", ".join(adapter.quote_identifier(col) for col in self._grouping_attributes)
+                ),
+            )
+            # Wrap in subquery with WHERE for the HAVING conditions
+            subquery_alias = adapter.quote_identifier(f"_aggr{next(self._subquery_alias_count)}")
+            outer_where = " WHERE (%s)" % ")AND(".join(self.restriction)
+            return f"SELECT * FROM ({inner_sql}) AS {subquery_alias}{outer_where}{self.sorting_clauses()}"
+        else:
+            # MySQL path: use HAVING directly
+            return "SELECT {distinct}{fields} FROM {from_}{where}{group_by}{sorting}".format(
+                distinct="DISTINCT " if distinct else "",
+                fields=fields,
+                from_=self.from_clause(),
+                where=self.where_clause(),
+                group_by=(
+                    ""
+                    if not self.primary_key
+                    else (
+                        " GROUP BY {}".format(
+                            ", ".join(adapter.quote_identifier(col) for col in self._grouping_attributes)
+                        )
+                        + ("" if not self.restriction else " HAVING (%s)" % ")AND(".join(self.restriction))
+                    )
+                ),
+                sorting=self.sorting_clauses(),
+            )
+
     def __len__(self):
         alias = self.connection.adapter.quote_identifier(f"${next(self._subquery_alias_count):x}")
         return self.connection.query(f"SELECT count(1) FROM ({self.make_sql()}) {alias}").fetchone()[0]
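Illustrative SQL for the two paths, with hypothetical table/column names and alias index (MySQL accepts a select-list alias in HAVING as an extension; PostgreSQL follows the standard and rejects it, hence the subquery):

# MySQL path: HAVING may reference the select-list alias directly
mysql_sql = 'SELECT subject_id, COUNT(*) as n FROM session GROUP BY subject_id HAVING (n > 2)'

# PostgreSQL path: the aggregation becomes a subquery whose alias n is
# visible to an ordinary outer WHERE
postgres_sql = (
    'SELECT * FROM ('
    'SELECT subject_id, COUNT(*) as n FROM session GROUP BY subject_id'
    ') AS "_aggr0" WHERE (n > 2)'
)

Keeping the plain HAVING path on MySQL avoids the extra subquery layer, which is the efficiency note in the commit message.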

src/datajoint/heading.py

Lines changed: 14 additions & 8 deletions
@@ -349,14 +349,20 @@ def quote(name):
             # Use adapter if available, otherwise use ANSI SQL double quotes (not backticks)
             return adapter.quote_identifier(name) if adapter else f'"{name}"'
 
-        return ",".join(
-            (
-                quote(name)
-                if self.attributes[name].attribute_expression is None
-                else self.attributes[name].attribute_expression + (f" as {quote(name)}" if include_aliases else "")
-            )
-            for name in fields
-        )
+        def render_field(name):
+            attr = self.attributes[name]
+            if attr.attribute_expression is None:
+                return quote(name)
+            else:
+                # Translate expression for backend compatibility (e.g., GROUP_CONCAT ↔ STRING_AGG)
+                expr = attr.attribute_expression
+                if adapter:
+                    expr = adapter.translate_expression(expr)
+                if include_aliases:
+                    return f"{expr} as {quote(name)}"
+                return expr
+
+        return ",".join(render_field(name) for name in fields)
 
     def __iter__(self):
         return iter(self.attributes)
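A minimal sketch of the new rendering rule in isolation (all names hypothetical; the real method pulls attribute_expression, adapter, and quote from the heading and connection):

def render(expr, name, translate, quote, include_aliases=True):
    expr = translate(expr)  # backend rewrite happens before aliasing
    return f"{expr} as {quote(name)}" if include_aliases else expr

rendered = render(
    "GROUP_CONCAT(note)",
    "notes",
    translate=lambda e: "STRING_AGG(note::text, ',')",  # stand-in for adapter.translate_expression
    quote=lambda n: f'"{n}"',
)
assert rendered == "STRING_AGG(note::text, ',') as \"notes\""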

src/datajoint/user_tables.py

Lines changed: 3 additions & 2 deletions
@@ -182,10 +182,11 @@ def table_name(cls):
 
     @property
     def full_table_name(cls):
-        """The fully qualified table name (`database`.`table`)."""
+        """The fully qualified table name (quoted per backend)."""
         if cls.database is None or cls.table_name is None:
             return None
-        return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name)
+        adapter = cls._connection.adapter
+        return f"{adapter.quote_identifier(cls.database)}.{adapter.quote_identifier(cls.table_name)}"
 
     @property
     def master(cls):
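The practical effect, sketched with hypothetical schema and table names: the same Part table now renders with the active backend's quoting instead of unconditional MySQL backticks:

def full_table_name(database, table_name, quote_identifier):
    return f"{quote_identifier(database)}.{quote_identifier(table_name)}"

mysql_quote = lambda n: f"`{n}`"
postgres_quote = lambda n: f'"{n}"'

assert full_table_name("lab", "session__trial", mysql_quote) == "`lab`.`session__trial`"
assert full_table_name("lab", "session__trial", postgres_quote) == '"lab"."session__trial"'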
