Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 189 additions & 28 deletions arkouda/numpy/pdarraycreation.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,22 @@ def array(
*,
copy: bool = ...,
max_bits: int = ...,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> Strings: ...


@overload
def array(
a: Union[pdarray, Strings, Iterable[Any], NDArray[Any]],
dtype: _NumericLikeDType = ..., # object covers np.dtype, type, int/float/bool dtypes, etc.
dtype: _NumericLikeDType = ...,
*,
copy: bool = ...,
max_bits: int = ...,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> pdarray: ...


Expand All @@ -112,15 +118,26 @@ def array(
*,
copy: bool = ...,
max_bits: int = ...,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> Union[pdarray, Strings]: ...


# The remaining overloads can stay as-is, but it's nicer to add the kwargs there too
# so tooling doesn't complain in those call patterns. Minimal version:


@overload
def array(
a: Union[pdarray, List[_NumericLikeDType]],
dtype: Union[_StringDType, _NumericLikeDType, None] = None,
copy: bool = False,
max_bits: int = -1,
*,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> pdarray: ...


Expand All @@ -130,6 +147,10 @@ def array(
dtype: Union[_StringDType, _NumericLikeDType, None] = None,
copy: bool = False,
max_bits: int = -1,
*,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> Strings: ...


Expand All @@ -139,6 +160,10 @@ def array(
dtype: Union[_StringDType, _NumericLikeDType, None] = None,
copy: bool = False,
max_bits: int = -1,
*,
unsafe: bool = ...,
any_neg: Optional[bool] = ...,
num_bits: Optional[int] = ...,
) -> Union[pdarray, Strings]: ...


Expand All @@ -147,11 +172,31 @@ def array(
dtype: Union[np.dtype, type, str, None] = None,
copy: bool = False,
max_bits: int = -1,
*,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the comment inside the signature. Can it be moved lower?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this look better? Or should I just move it into notes?

unsafe: bool = False,
any_neg: Optional[bool] = None,
num_bits: Optional[int] = None,
) -> Union[pdarray, Strings]:
"""
Convert a Python, NumPy, or Arkouda array-like into a `pdarray` or `Strings` object,
transferring data to the Arkouda server.

Bigint fast path
----------------
When `dtype` is `bigint` (or ``"bigint"``) and the input is a NumPy ``dtype=object`` array,
building the bigint representation may require Python-level passes over the data to infer
sign and required bit width.

Set ``unsafe=True`` to enable optional performance shortcuts for trusted inputs:

* If both ``any_neg`` and ``num_bits`` are provided, Arkouda will trust these hints and
skip inference passes, proceeding directly to limb extraction.
* If hints are not provided, Arkouda may still perform a single-pass inference step
for ``dtype=object`` inputs (implementation-dependent).

.. warning::
If ``unsafe=True`` and the provided hints are incorrect, results may be incorrect.

Parameters
----------
a : Union[pdarray, np.ndarray, Iterable, Strings]
Expand All @@ -170,6 +215,21 @@ def array(
max_bits : int, optional
The maximum number of bits for bigint arrays. Ignored for other dtypes.

unsafe : bool, default=False
Enable performance-oriented shortcuts for bigint creation from dtype=object arrays.
When True, the caller may supply trusted hints (any_neg, num_bits) to skip Python-level
full-array scans. If hints are not supplied, Arkouda will still do a single-pass scan
(rather than multiple scans) for dtype=object.

WARNING: If unsafe=True and provided hints are wrong, results may be incorrect.

any_neg : Optional[bool], default=None
Bigint hint: whether any value is negative. Only used when dtype=bigint and a is numeric/object.

num_bits : Optional[int], default=None
Bigint hint: required bit-width for values (including sign bit if signed).
If provided with unsafe=True, Arkouda can compute number of limbs without scanning.

Returns
-------
Union[pdarray, Strings]
Expand Down Expand Up @@ -250,7 +310,6 @@ def array(
a = list(a)

# If a is not already a numpy.ndarray, convert it

if not isinstance(a, np.ndarray):
try:
if dtype is not None and dtype != bigint:
Expand All @@ -270,8 +329,13 @@ def array(
a = np.array(a)
except (RuntimeError, TypeError, ValueError):
raise TypeError("a must be a pdarray, np.ndarray, or convertible to a numpy array")

if dtype is not None and dtype not in [bigint, "bigint"]:
a = a.astype(dtype)

# If numpy gave us an object array of integers and dtype was not specified, default to bigint.
# NOTE: This scan can be expensive for very large arrays; callers who already know they want
# bigint should pass dtype=bigint to avoid this inference.
if a.dtype == object and dtype is None and all(isinstance(x, (int, np.integer)) for x in a.flat):
dtype = bigint

Expand All @@ -280,11 +344,18 @@ def array(

# Return multi-dimensional pdarray if a.ndim in get_array_ranks()
# otherwise raise an error

if a.ndim not in get_array_ranks():
raise ValueError(f"array rank {a.ndim} not in compiled ranks {get_array_ranks()}")

# Bigint conversion path (including dtype=object numeric inputs)
if a.dtype.kind in ("i", "u", "f", "O") and (dtype == bigint or dtype == "bigint"):
return _bigint_from_numpy(a, max_bits)
return _bigint_from_numpy(
a,
max_bits,
unsafe=unsafe,
any_neg=any_neg,
num_bits=num_bits,
)
elif not (
np.issubdtype(a.dtype, np.str_)
or (a.dtype == np.object_ and a.size > 0 and isinstance(a[0], str))
Expand Down Expand Up @@ -315,7 +386,6 @@ def array(
return strings if dtype is None else type_cast(Union[pdarray, Strings], akcast(strings, dtype))

# If not strings, then check that dtype is supported in arkouda

from arkouda.numpy.util import _infer_shape_from_size

shape, ndim, full_size = _infer_shape_from_size(a.shape)
Expand Down Expand Up @@ -359,7 +429,33 @@ def array(
)


def _bigint_from_numpy(np_a: np.ndarray, max_bits: int) -> pdarray:
def _bigint_from_numpy(
np_a: np.ndarray,
max_bits: int,
*,
unsafe: bool = False,
any_neg: Optional[bool] = None,
num_bits: Optional[int] = None,
) -> pdarray:
"""
Create a bigint pdarray from a NumPy ndarray.

Fast paths:
- Empty arrays return an empty bigint with shape preserved.
- Non-object numeric arrays (int/uint/float) use a single-limb transfer.
- dtype=object arrays of Python ints are split into uint64 limbs client-side.

Performance notes:
- dtype=object requires Python-level work. To reduce overhead:
* this implementation uses a SINGLE PASS over the data to validate and infer sizing
(instead of multiple passes), unless the caller provides trusted hints.
* if unsafe=True and both any_neg and num_bits are provided, we skip all scans
and proceed directly to limb extraction.

WARNING:
- If unsafe=True and provided hints are wrong (e.g., num_bits too small or any_neg incorrect),
results may be incorrect.
"""
from arkouda.client import generic_msg
from arkouda.numpy.pdarrayclass import create_pdarray, pdarray

Expand All @@ -371,18 +467,13 @@ def _bigint_from_numpy(np_a: np.ndarray, max_bits: int) -> pdarray:
if a.size == 0:
return zeros(size=a.shape, dtype=bigint, max_bits=max_bits)

# Only ints/object should reach here (strings handled upstream)
# Only ints/uints/floats/object should reach here (strings handled upstream)
if a.dtype.kind not in ("i", "u", "f", "O"):
raise TypeError(f"bigint requires numeric input, got dtype={a.dtype}")

if a.dtype.kind == "O":
# Only allow Python ints and floats
if not all(isinstance(x, (int, float)) for x in a.flat):
raise TypeError("bigint requires numeric input, got non-numeric object")
if any(isinstance(x, float) for x in a.flat):
a = a.astype(np.float64, copy=False)

# Fast all-zero path or single limb path
# ------------------------------------------------------------
# Fast path for non-object numeric dtypes: send a single limb.
# ------------------------------------------------------------
if a.dtype.kind in ("i", "u", "f"):
if not np.any(a):
return zeros(size=a.shape, dtype=bigint, max_bits=max_bits)
Expand All @@ -394,7 +485,6 @@ def _bigint_from_numpy(np_a: np.ndarray, max_bits: int) -> pdarray:
else:
ak_a = array(a.astype(np.float64, copy=False))

# Send a single limb array
return create_pdarray(
generic_msg(
cmd=f"big_int_creation_one_limb<{ak_a.dtype},{ak_a.ndim}>",
Expand All @@ -405,31 +495,102 @@ def _bigint_from_numpy(np_a: np.ndarray, max_bits: int) -> pdarray:
},
)
)
else:
if all(int(x) == 0 for x in a.flat):

# ----------------------------------------------------------------
# dtype=object path (typically Python ints, sometimes floats/other):
# ----------------------------------------------------------------

# If caller supplied trusted sizing hints, skip all validation/scans.
# This is intended for benchmarking or trusted pipelines where the caller
# already knows whether the data are signed and how many bits are needed.
if unsafe and any_neg is not None and num_bits is not None:
if num_bits <= 0:
return zeros(size=a.shape, dtype=bigint, max_bits=max_bits)
any_neg = np.any(flat < 0)
req_bits: int
if any_neg:
req_bits = max(flat.max().bit_length(), (-flat.min()).bit_length()) + 1

req_bits = int(num_bits)
req_limbs = (req_bits + 63) >> 6 # == ceil(req_bits / 64)
else:
req_bits = flat.max().bit_length()
req_limbs = (req_bits + 63) // 64
mask = (1 << 64) - 1
# Single-pass scan:
# - validate types (unless unsafe)
# - detect floats (fallback to float64 one-limb path)
# - detect all-zero (fast path)
# - infer any_neg and required bits
has_float = False
any_nonzero = False
inferred_any_neg = False
max_mag_bits = 0

# Iterate once over Python objects.
for x in a.flat:
if unsafe:
# Caller asserts objects are numeric (typically Python ints).
pass
else:
if not isinstance(x, (int, float, np.integer, np.floating)):
raise TypeError("bigint requires numeric input, got non-numeric object")

# Handle floats: bigint-from-float behaves like existing code (cast to float64).
# Note: we only need to know *that* a float exists, not which values.
if isinstance(x, (float, np.floating)):
has_float = True
break

# Integers (Python int or numpy integer)
xi = int(x)
if xi != 0:
any_nonzero = True
if xi < 0:
inferred_any_neg = True
mag_bits = (-xi).bit_length()
else:
mag_bits = xi.bit_length()
max_mag_bits = max(max_mag_bits, mag_bits)

if has_float:
# Match previous behavior: object array containing floats becomes float64
# and uses the single-limb float path above.
a = a.astype(np.float64, copy=False)
if not np.any(a):
return zeros(size=a.shape, dtype=bigint, max_bits=max_bits)
ak_a = array(a)
return create_pdarray(
generic_msg(
cmd=f"big_int_creation_one_limb<{ak_a.dtype},{ak_a.ndim}>",
args={
"array": ak_a,
"shape": ak_a.shape,
"max_bits": max_bits,
},
)
)

if not any_nonzero:
return zeros(size=a.shape, dtype=bigint, max_bits=max_bits)

any_neg = inferred_any_neg
# For signed representation reserve a sign bit (+1).
req_bits = max_mag_bits + (1 if any_neg else 0)
req_limbs = (req_bits + 63) >> 6 # == ceil(req_bits / 64)

mask = 0xFFFFFFFFFFFFFFFF # 2**64 - 1
uint_arrays: List[Union[pdarray, Strings]] = []
# attempt to break bigint into multiple uint64 arrays

# Attempt to break bigint into multiple uint64 limb arrays.
# NOTE: This loop is inherently heavy for dtype=object inputs (Python big ints),
# but we minimize overhead by avoiding extra pre-scans of the data.
for _ in range(req_limbs):
low = flat & mask
flat = flat >> 64 # type: ignore
# low, flat = flat & mask, flat >> 64
uint_arrays.append(array(np.array(low, dtype=np.uint), dtype=akuint64))

# Server expects shape of the resulting bigint array (original shape).
return create_pdarray(
generic_msg(
cmd=f"big_int_creation_multi_limb<{uint_arrays[0].dtype},1>",
args={
"arrays": uint_arrays,
"num_arrays": len(uint_arrays),
"signed": any_neg,
"signed": bool(any_neg),
"shape": flat.shape,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be out_shape instead of flat.shape? Then you wouldn't need the reshape.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. Currently Chapel-side I have

    @arkouda.instantiateAndRegister("big_int_creation_multi_limb")
    proc bigIntCreationMultiLimbMsg(cmd: string,
                                    msgArgs: borrowed MessageArgs,
                                    st: borrowed SymTab,
                                    type array_dtype,
                                    param array_nd: int): MsgTuple throws
    where (array_dtype == uint(64) && array_nd == 1)

I'm not sure how difficult it would be to refactor the code for multiple dimensions, either.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, maybe later. Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we make an issue? Investigate making this a multidim function?

"max_bits": max_bits,
},
Expand Down
7 changes: 6 additions & 1 deletion benchmark_v2/array_transfer_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,13 @@ def bench_array_transfer_from_ndarray(benchmark, dtype):

def from_np():
ak.array(npa, max_bits=pytest.max_bits)
def from_np_bigint():
ak.array(npa, max_bits=-1, dtype=dtype, unsafe=True, num_bits=128, any_neg=False)

benchmark.pedantic(from_np, rounds=pytest.trials)
if dtype == "bigint":
benchmark.pedantic(from_np_bigint, rounds=pytest.trials)
else:
benchmark.pedantic(from_np, rounds=pytest.trials)

benchmark.extra_info["description"] = "Measures the performance of ak.array"
benchmark.extra_info["problem_size"] = N
Expand Down
11 changes: 8 additions & 3 deletions benchmarks/array_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,14 @@ def time_ak_array_transfer(N, trials, dtype, seed, max_bits=-1):
npa = a.to_ndarray()
end = time.time()
to_ndarray_times.append(end - start)
start = time.time()
aka = ak.array(npa, max_bits=max_bits, dtype=dtype)
end = time.time()
if dtype == ak.bigint.name:
start = time.time()
aka = ak.array(npa, max_bits=max_bits, dtype=dtype, unsafe=True, num_bits=128, any_neg=False)
end = time.time()
else:
start = time.time()
aka = ak.array(npa, max_bits=max_bits, dtype=dtype)
end = time.time()
to_pdarray_times.append(end - start)
gc.collect()
avgnd = sum(to_ndarray_times) / trials
Expand Down
Loading