|
20 | 20 | ) |
21 | 21 | from pytensor.link.numba.dispatch.string_codegen import create_tuple_string |
22 | 22 | from pytensor.link.numba.dispatch.vectorize_codegen import ( |
| 23 | + _jit_options, |
23 | 24 | _vectorized, |
| 25 | + _vectorized_with_gather, |
24 | 26 | encode_literals, |
25 | 27 | store_core_outputs, |
26 | 28 | ) |
|
35 | 37 | Mul, |
36 | 38 | Sub, |
37 | 39 | TrueDiv, |
38 | | - get_scalar_type, |
39 | 40 | maximum, |
40 | 41 | ) |
41 | 42 | from pytensor.scalar.basic import add as add_as |
42 | 43 | from pytensor.tensor.blas import BatchedDot |
43 | | -from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise |
| 44 | +from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise, ElemwiseWithGather |
44 | 45 | from pytensor.tensor.math import Argmax, Dot, MulWithoutZeros, Sum |
45 | 46 | from pytensor.tensor.special import LogSoftmax, Softmax, SoftmaxGrad |
46 | 47 |
|
@@ -312,8 +313,7 @@ def axis_apply_fn(x): |
312 | 313 |
|
313 | 314 | @register_funcify_and_cache_key(Elemwise) |
314 | 315 | def numba_funcify_Elemwise(op, node, **kwargs): |
315 | | - scalar_inputs = [get_scalar_type(dtype=input.dtype)() for input in node.inputs] |
316 | | - scalar_node = op.scalar_op.make_node(*scalar_inputs) |
| 316 | + scalar_node = op.make_scalar_node(*node.inputs) |
317 | 317 | scalar_op_fn, scalar_cache_key = numba_funcify_and_cache_key( |
318 | 318 | op.scalar_op, |
319 | 319 | node=scalar_node, |
@@ -390,6 +390,106 @@ def impl(*inputs): |
390 | 390 | return elemwise, elemwise_key |
391 | 391 |
|
392 | 392 |
|
@register_funcify_and_cache_key(ElemwiseWithGather)
def numba_funcify_ElemwiseWithGather(op, node, **kwargs):
    """Generate a fused gather + elemwise Numba implementation.

    Inspects the op's inner fgraph to locate its single ``Elemwise`` node
    and to determine which of that node's inputs are fed through an
    ``AdvancedSubtensor1`` gather.  A single vectorized loop is then
    emitted in which gathered inputs are read with indirect indexing,
    instead of materializing the gathered arrays first.

    Outer inputs are laid out as::

        [elemwise_input_0_or_source, ..., elemwise_input_N_or_source,
         idx_0, idx_1, ...]

    with one trailing index array per *distinct* gather (not necessarily a
    single one).  ``indexed_inputs`` tells the intrinsic, for each index
    array, which elemwise input positions are read through it.

    Returns
    -------
    (callable, str | None)
        The funcified implementation and its cache key (``None`` when the
        inner scalar op is not cacheable).
    """
    # Local import to avoid a circular import at module load time.
    from pytensor.tensor.subtensor import AdvancedSubtensor1

    # The inner graph is expected to contain exactly one Elemwise apply
    # node; unpacking raises if that invariant is violated.
    [elemwise_node] = [
        n for n in op.fgraph.apply_nodes if isinstance(n.op, Elemwise)
    ]

    # Map each distinct index array (keyed by identity) to the elemwise
    # input positions it gathers.  indexed_inputs is encoded as
    # ((inp_a, inp_b), (inp_c,), ...) — one tuple per index array.
    positions_by_index = {}
    for pos, var in enumerate(elemwise_node.inputs):
        if var.owner is None or not isinstance(var.owner.op, AdvancedSubtensor1):
            continue
        index_array = var.owner.inputs[1]
        positions_by_index.setdefault(id(index_array), []).append(pos)

    indexed_inputs = tuple(map(tuple, positions_by_index.values()))
    gathered_positions = {pos for group in indexed_inputs for pos in group}

    # Funcify the scalar core of the inner Elemwise.
    scalar_node = elemwise_node.op.make_scalar_node(*elemwise_node.inputs)
    scalar_op_fn, scalar_cache_key = numba_funcify_and_cache_key(
        elemwise_node.op.scalar_op, node=scalar_node, **kwargs
    )

    n_core_inputs = len(elemwise_node.inputs)
    n_core_outputs = len(elemwise_node.outputs)
    core_op_fn = store_core_outputs(
        scalar_op_fn, nin=n_core_inputs, nout=n_core_outputs
    )

    # Gathered inputs are iterated via their gather *source*, so use the
    # source's broadcast pattern; direct inputs keep their own.
    input_bc_patterns = tuple(
        var.owner.inputs[0].type.broadcastable
        if pos in gathered_positions
        else var.type.broadcastable
        for pos, var in enumerate(elemwise_node.inputs)
    )
    output_bc_patterns = tuple(out.type.broadcastable for out in node.outputs)
    output_dtypes = tuple(out.type.dtype for out in node.outputs)
    inplace_pattern = tuple(elemwise_node.op.inplace_pattern.items())
    # The core op is scalar, so every core output shape is ().
    core_output_shapes = tuple(() for _ in range(n_core_outputs))

    # Encode compile-time constants as literals usable inside the overload.
    input_bc_patterns_enc = encode_literals(input_bc_patterns)
    output_bc_patterns_enc = encode_literals(output_bc_patterns)
    output_dtypes_enc = encode_literals(output_dtypes)
    inplace_pattern_enc = encode_literals(inplace_pattern)
    indexed_inputs_enc = encode_literals(indexed_inputs)

    def elemwise_with_gather(*outer_inputs):
        raise NotImplementedError(
            "ElemwiseWithGather cannot be evaluated in Python (non-JIT) mode."
        )

    @overload(elemwise_with_gather, jit_options=_jit_options)
    def ov_elemwise_with_gather(*outer_inputs):
        def impl(*outer_inputs):
            return _vectorized_with_gather(
                core_op_fn,
                input_bc_patterns_enc,
                output_bc_patterns_enc,
                output_dtypes_enc,
                inplace_pattern_enc,
                True,  # allow_core_scalar
                (),  # constant_inputs
                outer_inputs,
                core_output_shapes,
                None,  # size
                indexed_inputs_enc,
            )

        return impl

    # A cache key is only meaningful when the inner scalar op has one.
    if scalar_cache_key is None:
        cache_key = None
    else:
        fingerprint = str(
            (
                type(op),
                "ElemwiseWithGather",
                inplace_pattern,
                input_bc_patterns,
                indexed_inputs,
                scalar_cache_key,
            )
        )
        cache_key = sha256(fingerprint.encode()).hexdigest()

    return elemwise_with_gather, cache_key
| 491 | + |
| 492 | + |
393 | 493 | @register_funcify_and_cache_key(CAReduce) |
394 | 494 | def numba_funcify_CAReduce(op, node, **kwargs): |
395 | 495 | axes = op.axis |
|
0 commit comments