From 7128c6b7b085a6dd7932bad742d769a4bc1b0212 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 30 Jul 2025 07:26:41 -0700
Subject: [PATCH 01/15] Extract _batched_lu_factor() to simplify batched logic
 in _lu_factor()

---
 dpnp/linalg/dpnp_utils_linalg.py | 236 ++++++++++++++++---------------
 1 file changed, 123 insertions(+), 113 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 51cebb2815bb..795ad27dcfb5 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -275,6 +275,126 @@ def _batched_inv(a, res_type):
     return a_h.reshape(orig_shape)
 
 
+def _batched_lu_factor(a, res_type):
+    """Compute pivoted LU decomposition for a batch of matrices."""
+
+    # TODO: Find out at which array sizes the best performance is obtained
+    # getrf_batch implementation shows slow results with large arrays on GPU.
+    # Use getrf_batch only on CPU.
+    # On GPU call getrf for each two-dimensional array by loop
+    use_batch = a.sycl_device.has_aspect_cpu
+
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
+    n = a.shape[-2]
+    orig_shape = a.shape
+    # get 3d input arrays by reshape
+    a = dpnp.reshape(a, (-1, n, n))
+    batch_size = a.shape[0]
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+
+    if use_batch:
+        # `a` must be copied because getrf_batch destroys the input matrix
+        a_h = dpnp.empty_like(a, order="C", dtype=res_type)
+        ipiv_h = dpnp.empty(
+            (batch_size, n),
+            dtype=dpnp.int64,
+            order="C",
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        dev_info_h = [0] * batch_size
+
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=a_h.get_array(),
+            sycl_queue=a_sycl_queue,
+            depends=_manager.submitted_events,
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+
+        ipiv_stride = n
+        a_stride = a_h.strides[0]
+
+        # Call the LAPACK extension function _getrf_batch
+        # to perform LU decomposition of a batch of general matrices
+        ht_ev, getrf_ev = li._getrf_batch(
+            a_sycl_queue,
+            a_h.get_array(),
+            ipiv_h.get_array(),
+            dev_info_h,
+            n,
+            a_stride,
+            ipiv_stride,
+            batch_size,
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, getrf_ev)
+
+        dev_info_array = dpnp.array(
+            dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+        )
+
+        # Reshape the results back to their original shape
+        a_h = a_h.reshape(orig_shape)
+        ipiv_h = ipiv_h.reshape(orig_shape[:-1])
+        dev_info_array = dev_info_array.reshape(orig_shape[:-2])
+
+        return (a_h, ipiv_h, dev_info_array)
+
+    # Initialize lists for storing arrays and events for each batch
+    a_vecs = [None] * batch_size
+    ipiv_vecs = [None] * batch_size
+    dev_info_vecs = [None] * batch_size
+
+    dep_evs = _manager.submitted_events
+
+    # Process each batch
+    for i in range(batch_size):
+        # Copy each 2D slice to a new array because getrf will destroy
+        # the input matrix
+        a_vecs[i] = dpnp.empty_like(a[i], order="C", dtype=res_type)
+
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr[i],
+            dst=a_vecs[i].get_array(),
+            sycl_queue=a_sycl_queue,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+
+        ipiv_vecs[i] = dpnp.empty(
+            (n,),
+            dtype=dpnp.int64,
+            order="C",
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        dev_info_vecs[i] = [0]
+
+        # Call the LAPACK extension function _getrf
+        # to perform LU decomposition on each batch in 'a_vecs[i]'
+        ht_ev, getrf_ev = li._getrf(
+            a_sycl_queue,
+            a_vecs[i].get_array(),
+            ipiv_vecs[i].get_array(),
+            dev_info_vecs[i],
+            depends=[copy_ev],
+        )
+        _manager.add_event_pair(ht_ev, getrf_ev)
+
+    # Reshape the results back to their original shape
+    out_a = dpnp.array(a_vecs, order="C").reshape(orig_shape)
+    out_ipiv = dpnp.array(ipiv_vecs).reshape(orig_shape[:-1])
+    out_dev_info = dpnp.array(
+        dev_info_vecs, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+    ).reshape(orig_shape[:-2])
+
+    return (out_a, out_ipiv, out_dev_info)
+
+
 def _batched_solve(a, b, exec_q, res_usm_type, res_type):
     """
     _batched_solve(a, b, exec_q, res_usm_type, res_type)
@@ -901,125 +1021,15 @@ def _lu_factor(a, res_type):
 
     """
 
+    if a.ndim > 2:
+        return _batched_lu_factor(a, res_type)
+
     n = a.shape[-2]
 
     a_sycl_queue = a.sycl_queue
     a_usm_type = a.usm_type
-
-    # TODO: Find out at which array sizes the best performance is obtained
-    # getrf_batch implementation shows slow results with large arrays on GPU.
-    # Use getrf_batch only on CPU.
-    # On GPU call getrf for each two-dimensional array by loop
-    use_batch = a.sycl_device.has_aspect_cpu
-
     _manager = dpu.SequentialOrderManager[a_sycl_queue]
 
-    if a.ndim > 2:
-        orig_shape = a.shape
-        # get 3d input arrays by reshape
-        a = dpnp.reshape(a, (-1, n, n))
-        batch_size = a.shape[0]
-        a_usm_arr = dpnp.get_usm_ndarray(a)
-
-        if use_batch:
-            # `a` must be copied because getrf_batch destroys the input matrix
-            a_h = dpnp.empty_like(a, order="C", dtype=res_type)
-            ipiv_h = dpnp.empty(
-                (batch_size, n),
-                dtype=dpnp.int64,
-                order="C",
-                usm_type=a_usm_type,
-                sycl_queue=a_sycl_queue,
-            )
-            dev_info_h = [0] * batch_size
-
-            ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr,
-                dst=a_h.get_array(),
-                sycl_queue=a_sycl_queue,
-                depends=_manager.submitted_events,
-            )
-            _manager.add_event_pair(ht_ev, copy_ev)
-
-            ipiv_stride = n
-            a_stride = a_h.strides[0]
-
-            # Call the LAPACK extension function _getrf_batch
-            # to perform LU decomposition of a batch of general matrices
-            ht_ev, getrf_ev = li._getrf_batch(
-                a_sycl_queue,
-                a_h.get_array(),
-                ipiv_h.get_array(),
-                dev_info_h,
-                n,
-                a_stride,
-                ipiv_stride,
-                batch_size,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_ev, getrf_ev)
-
-            dev_info_array = dpnp.array(
-                dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
-            )
-
-            # Reshape the results back to their original shape
-            a_h = a_h.reshape(orig_shape)
-            ipiv_h = ipiv_h.reshape(orig_shape[:-1])
-            dev_info_array = dev_info_array.reshape(orig_shape[:-2])
-
-            return (a_h, ipiv_h, dev_info_array)
-
-        # Initialize lists for storing arrays and events for each batch
-        a_vecs = [None] * batch_size
-        ipiv_vecs = [None] * batch_size
-        dev_info_vecs = [None] * batch_size
-
-        dep_evs = _manager.submitted_events
-
-        # Process each batch
-        for i in range(batch_size):
-            # Copy each 2D slice to a new array because getrf will destroy
-            # the input matrix
-            a_vecs[i] = dpnp.empty_like(a[i], order="C", dtype=res_type)
-
-            ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr[i],
-                dst=a_vecs[i].get_array(),
-                sycl_queue=a_sycl_queue,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_ev, copy_ev)
-
-            ipiv_vecs[i] = dpnp.empty(
-                (n,),
-                dtype=dpnp.int64,
-                order="C",
-                usm_type=a_usm_type,
-                sycl_queue=a_sycl_queue,
-            )
-            dev_info_vecs[i] = [0]
-
-            # Call the LAPACK extension function _getrf
-            # to perform LU decomposition on each batch in 'a_vecs[i]'
-            ht_ev, getrf_ev = li._getrf(
-                a_sycl_queue,
-                a_vecs[i].get_array(),
-                ipiv_vecs[i].get_array(),
-                dev_info_vecs[i],
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_ev, getrf_ev)
-
-        # Reshape the results back to their original shape
-        out_a = dpnp.array(a_vecs, order="C").reshape(orig_shape)
-        out_ipiv = dpnp.array(ipiv_vecs).reshape(orig_shape[:-1])
-        out_dev_info = dpnp.array(
-            dev_info_vecs, usm_type=a_usm_type, sycl_queue=a_sycl_queue
-        ).reshape(orig_shape[:-2])
-
-        return (out_a, out_ipiv, out_dev_info)
-
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
     # `a` must be copied because getrf destroys the input matrix

From 8d491a09ea6534fdf35e9b874e8b02992ce52c35 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 4 Aug 2025 06:24:21 -0700
Subject: [PATCH 02/15] Allow F-contig input in getrf

---
 dpnp/backend/extensions/lapack/getrf.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp
index 5ba1a7cc9b65..65bd191265c0 100644
--- a/dpnp/backend/extensions/lapack/getrf.cpp
+++ b/dpnp/backend/extensions/lapack/getrf.cpp
@@ -190,10 +190,12 @@ std::pair<sycl::event, sycl::event>
     }
 
     bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_a_array_f_contig = a_array.is_f_contiguous();
     bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous();
-    if (!is_a_array_c_contig) {
+    if (!is_a_array_c_contig && !is_a_array_f_contig) {
         throw py::value_error("The input array "
-                              "must be C-contiguous");
+                              "must be must be either C-contiguous "
+                              "or F-contiguous");
     }
     if (!is_ipiv_array_c_contig) {
         throw py::value_error("The array of pivot indices "

From 0d4a33a1ad5ef363d7f8d4d1e160d8d286183da8 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 4 Aug 2025 06:25:59 -0700
Subject: [PATCH 03/15] Allow F-contig input in getrf_batch

---
 dpnp/backend/extensions/lapack/getrf_batch.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/getrf_batch.cpp b/dpnp/backend/extensions/lapack/getrf_batch.cpp
index 83d06816fce4..6ac61045653b 100644
--- a/dpnp/backend/extensions/lapack/getrf_batch.cpp
+++ b/dpnp/backend/extensions/lapack/getrf_batch.cpp
@@ -221,10 +221,12 @@ std::pair<sycl::event, sycl::event>
     }
 
     bool is_a_array_c_contig = a_array.is_c_contiguous();
+    bool is_a_array_f_contig = a_array.is_f_contiguous();
     bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous();
-    if (!is_a_array_c_contig) {
+    if (!is_a_array_c_contig && !is_a_array_f_contig) {
         throw py::value_error("The input array "
-                              "must be C-contiguous");
+                              "must be must be either C-contiguous "
+                              "or F-contiguous");
     }
     if (!is_ipiv_array_c_contig) {
         throw py::value_error("The array of pivot indices "

From 86381ca24192c9aa285d64c80043d0916975f6d7 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 5 Aug 2025 02:59:40 -0700
Subject: [PATCH 04/15] Implement dpnp_lu_factor() and update _lu_factor()

---
 dpnp/linalg/dpnp_utils_linalg.py | 108 ++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 16 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 795ad27dcfb5..06655cfb5ebb 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -39,6 +39,7 @@
 # pylint: disable=useless-import-alias
 
 from typing import NamedTuple
+from warnings import warn
 
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
@@ -986,12 +987,29 @@ def _hermitian_svd(a, compute_uv):
     return dpnp.sort(s)[..., ::-1]
 
 
+def _is_copy_required(a, res_type):
+    """
+    Determine if `a` needs to be copied before LU decomposition.
+    This matches SciPy behavior: copy is needed unless input is suitable
+    for in-place modification.
+    """
+
+    if a.dtype != res_type:
+        return True
+    if not a.flags["F_CONTIGUOUS"]:
+        return True
+    if not a.flags["WRITABLE"]:
+        return True
+
+    return False
+
+
 def _is_empty_2d(arr):
     # check size first for efficiency
     return arr.size == 0 and numpy.prod(arr.shape[-2:]) == 0
 
 
-def _lu_factor(a, res_type):
+def _lu_factor(a, res_type, scipy=False, overwrite_a=False):
     """
     Compute pivoted LU decomposition.
 
@@ -1032,18 +1050,41 @@ def _lu_factor(a, res_type):
 
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
-    # `a` must be copied because getrf destroys the input matrix
-    a_h = dpnp.empty_like(a, order="C", dtype=res_type)
+    if not scipy:
+        # Internal use case (e.g., det(), slogdet()). Always copy.
+        # `a` must be copied because getrf destroys the input matrix
+        a_h = dpnp.empty_like(a, order="C", dtype=res_type)
 
-    # use DPCTL tensor function to fill the сopy of the input array
-    # from the input array
-    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=a_usm_arr,
-        dst=a_h.get_array(),
-        sycl_queue=a_sycl_queue,
-        depends=_manager.submitted_events,
-    )
-    _manager.add_event_pair(ht_ev, copy_ev)
+        # use DPCTL tensor function to fill the сopy of the input array
+        # from the input array
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=a_h.get_array(),
+            sycl_queue=a_sycl_queue,
+            depends=_manager.submitted_events,
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+
+    else:
+        # SciPy-compatible behavior
+        # Copy is required if:
+        # - overwrite_a is False (always copy),
+        # - dtype mismatch,
+        # - not F-contiguous,
+        # - not writeable
+        if not overwrite_a or _is_copy_required(a, res_type):
+            a_h = dpnp.empty_like(a, order="F", dtype=res_type)
+            ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=a_usm_arr,
+                dst=a_h.get_array(),
+                sycl_queue=a_sycl_queue,
+                depends=_manager.submitted_events,
+            )
+            _manager.add_event_pair(ht_ev, copy_ev)
+        else:
+            # input is suitable for in-place modification
+            a_h = a
+            copy_ev = None
 
     ipiv_h = dpnp.empty(
         n,
@@ -1061,13 +1102,18 @@ def _lu_factor(a, res_type):
         a_h.get_array(),
         ipiv_h.get_array(),
         dev_info_h,
-        depends=[copy_ev],
+        depends=[copy_ev] if copy_ev is not None else [],
     )
     _manager.add_event_pair(ht_ev, getrf_ev)
 
-    dev_info_array = dpnp.array(
-        dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
-    )
+    # Return list if called in SciPy-compatible mode
+    # else dpnp.ndarray
+    if scipy:
+        dev_info_array = dev_info_h
+    else:
+        dev_info_array = dpnp.array(
+            dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+        )
 
     # Return a tuple containing the factorized matrix 'a_h',
     # pivot indices 'ipiv_h'
@@ -1075,6 +1121,36 @@ def _lu_factor(a, res_type):
     return (a_h, ipiv_h, dev_info_array)
 
 
+def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
+    """Compute pivoted LU decomposition."""
+
+    res_type = _common_type(a)
+
+    # accommodate empty arrays
+    if a.size == 0:
+        lu = dpnp.empty_like(a)
+        piv = dpnp.arange(0, dtype=dpnp.int32)
+        return lu, piv
+
+    if check_finite:
+        if not dpnp.isfinite(a).all():
+            raise ValueError("array must not contain infs or NaNs")
+
+    lu, piv, dev_info = _lu_factor(
+        a, res_type, scipy=True, overwrite_a=overwrite_a
+    )
+
+    if any(dev_info):
+        diag_nums = ", ".join(str(v) for v in dev_info if v > 0)
+        warn(
+            f"Diagonal number {diag_nums} are exactly zero. Singular matrix.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+
+    return lu, piv
+
+
 def _multi_dot(arrays, order, i, j, out=None):
     """Actually do the multiplication with the given order."""
 

From 6e3ad5724d2256d6dfd70451088a051cf822a4c2 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 5 Aug 2025 05:00:41 -0700
Subject: [PATCH 05/15] Simplify lu_factor by moving logic to dpnp_lu_factor

---
 dpnp/linalg/dpnp_utils_linalg.py | 180 ++++++++++++++++++-------------
 1 file changed, 106 insertions(+), 74 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 06655cfb5ebb..347d3b5d193b 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -1009,7 +1009,7 @@ def _is_empty_2d(arr):
     return arr.size == 0 and numpy.prod(arr.shape[-2:]) == 0
 
 
-def _lu_factor(a, res_type, scipy=False, overwrite_a=False):
+def _lu_factor(a, res_type):
     """
     Compute pivoted LU decomposition.
 
@@ -1050,41 +1050,18 @@ def _lu_factor(a, res_type, scipy=False, overwrite_a=False):
 
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
-    if not scipy:
-        # Internal use case (e.g., det(), slogdet()). Always copy.
-        # `a` must be copied because getrf destroys the input matrix
-        a_h = dpnp.empty_like(a, order="C", dtype=res_type)
-
-        # use DPCTL tensor function to fill the сopy of the input array
-        # from the input array
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr,
-            dst=a_h.get_array(),
-            sycl_queue=a_sycl_queue,
-            depends=_manager.submitted_events,
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
+    # `a` must be copied because getrf destroys the input matrix
+    a_h = dpnp.empty_like(a, order="C", dtype=res_type)
 
-    else:
-        # SciPy-compatible behavior
-        # Copy is required if:
-        # - overwrite_a is False (always copy),
-        # - dtype mismatch,
-        # - not F-contiguous,
-        # - not writeable
-        if not overwrite_a or _is_copy_required(a, res_type):
-            a_h = dpnp.empty_like(a, order="F", dtype=res_type)
-            ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_usm_arr,
-                dst=a_h.get_array(),
-                sycl_queue=a_sycl_queue,
-                depends=_manager.submitted_events,
-            )
-            _manager.add_event_pair(ht_ev, copy_ev)
-        else:
-            # input is suitable for in-place modification
-            a_h = a
-            copy_ev = None
+    # use DPCTL tensor function to fill the сopy of the input array
+    # from the input array
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
+    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
     ipiv_h = dpnp.empty(
         n,
@@ -1102,18 +1079,13 @@ def _lu_factor(a, res_type, scipy=False, overwrite_a=False):
         a_h.get_array(),
         ipiv_h.get_array(),
         dev_info_h,
-        depends=[copy_ev] if copy_ev is not None else [],
+        depends=[copy_ev],
     )
     _manager.add_event_pair(ht_ev, getrf_ev)
 
-    # Return list if called in SciPy-compatible mode
-    # else dpnp.ndarray
-    if scipy:
-        dev_info_array = dev_info_h
-    else:
-        dev_info_array = dpnp.array(
-            dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
-        )
+    dev_info_array = dpnp.array(
+        dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+    )
 
     # Return a tuple containing the factorized matrix 'a_h',
     # pivot indices 'ipiv_h'
@@ -1121,36 +1093,6 @@ def _lu_factor(a, res_type, scipy=False, overwrite_a=False):
     return (a_h, ipiv_h, dev_info_array)
 
 
-def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
-    """Compute pivoted LU decomposition."""
-
-    res_type = _common_type(a)
-
-    # accommodate empty arrays
-    if a.size == 0:
-        lu = dpnp.empty_like(a)
-        piv = dpnp.arange(0, dtype=dpnp.int32)
-        return lu, piv
-
-    if check_finite:
-        if not dpnp.isfinite(a).all():
-            raise ValueError("array must not contain infs or NaNs")
-
-    lu, piv, dev_info = _lu_factor(
-        a, res_type, scipy=True, overwrite_a=overwrite_a
-    )
-
-    if any(dev_info):
-        diag_nums = ", ".join(str(v) for v in dev_info if v > 0)
-        warn(
-            f"Diagonal number {diag_nums} are exactly zero. Singular matrix.",
-            RuntimeWarning,
-            stacklevel=2,
-        )
-
-    return lu, piv
-
-
 def _multi_dot(arrays, order, i, j, out=None):
     """Actually do the multiplication with the given order."""
 
@@ -2349,6 +2291,96 @@ def dpnp_lstsq(a, b, rcond=None):
     return x, resids, rank, s
 
 
+def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
+    """
+    dpnp_lu_factor(a, overwrite_a=False, check_finite=True)
+
+    Compute pivoted LU decomposition (SciPy-compatible behavior).
+
+    This function mimics the behavior of `scipy.linalg.lu_factor` including
+    support for `overwrite_a`, `check_finite` and 0-based pivot indexing.
+
+    """
+
+    res_type = _common_type(a)
+
+    # accommodate empty arrays
+    if a.size == 0:
+        lu = dpnp.empty_like(a)
+        piv = dpnp.arange(0, dtype=dpnp.int32)
+        return lu, piv
+
+    if check_finite:
+        if not dpnp.isfinite(a).all():
+            raise ValueError("array must not contain infs or NaNs")
+
+    # if a.ndim > 2:
+    # return _batched_lu_factor_scipy(a, res_type, overwrite_a=overwrite_a)
+
+    n = a.shape[-2]
+
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+    _manager = dpu.SequentialOrderManager[a_sycl_queue]
+
+    a_usm_arr = dpnp.get_usm_ndarray(a)
+
+    # SciPy-compatible behavior
+    # Copy is required if:
+    # - overwrite_a is False (always copy),
+    # - dtype mismatch,
+    # - not F-contiguous,
+    # - not writeable
+    if not overwrite_a or _is_copy_required(a, res_type):
+        a_h = dpnp.empty_like(a, order="F", dtype=res_type)
+        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=a_usm_arr,
+            dst=a_h.get_array(),
+            sycl_queue=a_sycl_queue,
+            depends=_manager.submitted_events,
+        )
+        _manager.add_event_pair(ht_ev, copy_ev)
+    else:
+        # input is suitable for in-place modification
+        a_h = a
+        copy_ev = None
+
+    ipiv_h = dpnp.empty(
+        n,
+        dtype=dpnp.int64,
+        order="C",
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+    dev_info_h = [0]
+
+    # Call the LAPACK extension function _getrf
+    # to perform LU decomposition on the input matrix
+    ht_ev, getrf_ev = li._getrf(
+        a_sycl_queue,
+        a_h.get_array(),
+        ipiv_h.get_array(),
+        dev_info_h,
+        depends=[copy_ev] if copy_ev is not None else [],
+    )
+    _manager.add_event_pair(ht_ev, getrf_ev)
+
+    if any(dev_info_h):
+        diag_nums = ", ".join(str(v) for v in dev_info_h if v > 0)
+        warn(
+            f"Diagonal number {diag_nums} are exactly zero. Singular matrix.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+
+    # MKL lapack uses 1-origin while SciPy uses 0-origin
+    ipiv_h -= 1
+
+    # Return a tuple containing the factorized matrix 'a_h',
+    # pivot indices 'ipiv_h'
+    return (a_h, ipiv_h)
+
+
 def dpnp_matrix_power(a, n):
     """
     dpnp_matrix_power(a, n)

From 63138e9ffe7a912aea11a7cf1f79b31f7864c1ff Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 12 Aug 2025 03:15:05 -0700
Subject: [PATCH 06/15] Extend getrf to support non-square matrices

---
 dpnp/backend/extensions/lapack/getrf.cpp | 30 +++++++++++++++---------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp
index 65bd191265c0..66b91a4121fa 100644
--- a/dpnp/backend/extensions/lapack/getrf.cpp
+++ b/dpnp/backend/extensions/lapack/getrf.cpp
@@ -44,6 +44,7 @@ namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
 typedef sycl::event (*getrf_impl_fn_ptr_t)(sycl::queue &,
+                                           const std::int64_t,
                                            const std::int64_t,
                                            char *,
                                            std::int64_t,
@@ -56,6 +57,7 @@ static getrf_impl_fn_ptr_t getrf_dispatch_vector[dpctl_td_ns::num_types];
 
 template <typename T>
 static sycl::event getrf_impl(sycl::queue &exec_q,
+                              const std::int64_t m,
                               const std::int64_t n,
                               char *in_a,
                               std::int64_t lda,
@@ -82,11 +84,11 @@ static sycl::event getrf_impl(sycl::queue &exec_q,
 
         getrf_event = mkl_lapack::getrf(
             exec_q,
-            n,    // The order of the square matrix A (0 ≤ n).
+            m,    // The number of rows in the input matrix A (0 ≤ m).
                   // It must be a non-negative integer.
-            n,    // The number of columns in the square matrix A (0 ≤ n).
+            n,    // The number of columns in the input matrix A (0 ≤ n).
                   // It must be a non-negative integer.
-            a,    // Pointer to the square matrix A (n x n).
+            a,    // Pointer to the input matrix A (n x n).
             lda,  // The leading dimension of matrix A.
                   // It must be at least max(1, n).
             ipiv, // Pointer to the output array of pivot indices.
@@ -99,7 +101,7 @@ static sycl::event getrf_impl(sycl::queue &exec_q,
 
         if (info < 0) {
             error_msg << "Parameter number " << -info
-                      << " had an illegal value.";
+                      << " had an illegal value";
         }
         else if (info == scratchpad_size && e.detail() != 0) {
             error_msg
@@ -168,13 +170,13 @@ std::pair<sycl::event, sycl::event>
     if (a_array_nd != 2) {
         throw py::value_error(
             "The input array has ndim=" + std::to_string(a_array_nd) +
-            ", but a 2-dimensional array is expected.");
+            ", but a 2-dimensional array is expected");
     }
 
     if (ipiv_array_nd != 1) {
         throw py::value_error("The array of pivot indices has ndim=" +
                               std::to_string(ipiv_array_nd) +
-                              ", but a 1-dimensional array is expected.");
+                              ", but a 1-dimensional array is expected");
     }
 
     // check compatibility of execution queue and allocation queue
@@ -210,7 +212,7 @@ std::pair<sycl::event, sycl::event>
     if (getrf_fn == nullptr) {
         throw py::value_error(
             "No getrf implementation defined for the provided type "
-            "of the input matrix.");
+            "of the input matrix");
     }
 
     auto ipiv_types = dpctl_td_ns::usm_ndarray_types();
@@ -218,19 +220,25 @@ std::pair<sycl::event, sycl::event>
         ipiv_types.typenum_to_lookup_id(ipiv_array.get_typenum());
 
     if (ipiv_array_type_id != static_cast<int>(dpctl_td_ns::typenum_t::INT64)) {
-        throw py::value_error("The type of 'ipiv_array' must be int64.");
+        throw py::value_error("The type of 'ipiv_array' must be int64");
     }
 
-    const std::int64_t n = a_array.get_shape_raw()[0];
+    const py::ssize_t *a_array_shape = a_array.get_shape_raw();
+    const std::int64_t m = a_array_shape[0];
+    const std::int64_t n = a_array_shape[1];
+    const std::int64_t lda = std::max<size_t>(1UL, m);
+
+    if (ipiv_array.get_size() != std::min(m, n)) {
+        throw py::value_error("The size of 'ipiv_array' must be min(m, n)");
+    }
 
     char *a_array_data = a_array.get_data();
-    const std::int64_t lda = std::max<size_t>(1UL, n);
 
     char *ipiv_array_data = ipiv_array.get_data();
     std::int64_t *d_ipiv = reinterpret_cast<std::int64_t *>(ipiv_array_data);
 
     std::vector<sycl::event> host_task_events;
-    sycl::event getrf_ev = getrf_fn(exec_q, n, a_array_data, lda, d_ipiv,
+    sycl::event getrf_ev = getrf_fn(exec_q, m, n, a_array_data, lda, d_ipiv,
                                     dev_info, host_task_events, depends);
 
     sycl::event args_ev = dpctl::utils::keep_args_alive(

From 8a17f678b42f8126beb938376a401af484524e97 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 12 Aug 2025 03:22:25 -0700
Subject: [PATCH 07/15] Raise NotImplementedError for bathed matrices

---
 dpnp/linalg/dpnp_utils_linalg.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 347d3b5d193b..557258c04070 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -2314,10 +2314,10 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
         if not dpnp.isfinite(a).all():
             raise ValueError("array must not contain infs or NaNs")
 
-    # if a.ndim > 2:
-    # return _batched_lu_factor_scipy(a, res_type, overwrite_a=overwrite_a)
+    if a.ndim > 2:
+        raise NotImplementedError("Batched matrices are not supported")
 
-    n = a.shape[-2]
+    m, n = a.shape
 
     a_sycl_queue = a.sycl_queue
     a_usm_type = a.usm_type
@@ -2346,7 +2346,7 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
         copy_ev = None
 
     ipiv_h = dpnp.empty(
-        n,
+        min(m, n),
         dtype=dpnp.int64,
         order="C",
         usm_type=a_usm_type,

From 6fe2fe492cb36731c8ffe4a86dc0f5c0062d128f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 12 Aug 2025 05:29:58 -0700
Subject: [PATCH 08/15] Add implementation of dpnp.linalg.lu_factor()

---
 dpnp/linalg/dpnp_iface_linalg.py | 64 ++++++++++++++++++++++++++++++++
 dpnp/linalg/dpnp_utils_linalg.py |  2 +-
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py
index 07ed0078be28..55381c917203 100644
--- a/dpnp/linalg/dpnp_iface_linalg.py
+++ b/dpnp/linalg/dpnp_iface_linalg.py
@@ -56,6 +56,7 @@
     dpnp_eigh,
     dpnp_inv,
     dpnp_lstsq,
+    dpnp_lu_factor,
     dpnp_matrix_power,
     dpnp_matrix_rank,
     dpnp_multi_dot,
@@ -79,6 +80,7 @@
     "eigvalsh",
     "inv",
     "lstsq",
+    "lu_factor",
     "matmul",
     "matrix_norm",
     "matrix_power",
@@ -901,6 +903,68 @@ def lstsq(a, b, rcond=None):
     return dpnp_lstsq(a, b, rcond=rcond)
 
 
+def lu_factor(a, overwrite_a=False, check_finite=True):
+    """
+    Compute the pivoted LU decomposition of a matrix.
+
+    The decomposition is::
+
+        A = P @ L @ U
+
+    where `P` is a permutation matrix, `L` is lower triangular with unit
+    diagonal elements, and `U` is upper triangular.
+
+    Parameters
+    ----------
+    a : (M, N) {dpnp.ndarray, usm_ndarray}
+        Input array to decompose.
+    overwrite_a : {None, bool}, optional
+        Whether to overwrite data in `a` (may increase performance)
+        Default: ``False``.
+    check_finite : {None, bool}, optional
+        Whether to check that the input matrix contains only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+
+    Returns
+    -------
+    lu :(M, N) dpnp.ndarray
+        Matrix containing U in its upper triangle, and L in its lower triangle.
+        The unit diagonal elements of L are not stored.
+    piv (K, ): dpnp.ndarray
+        Pivot indices representing the permutation matrix P:
+        row i of matrix was interchanged with row piv[i].
+        ``K = min(M, N)``.
+
+    Warning
+    -------
+    This function synchronizes in order to validate array elements
+    when ``check_finite=True``.
+
+    Limitations
+    -----------
+    Only two-dimensional input matrices are supported.
+    Otherwise, the function raises ``NotImplementedError`` exception.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> a = np.array([[4., 3.], [6., 3.]])
+    >>> lu, piv = np.linalg.lu_factor(a)
+    >>> lu
+    array([[6.        , 3.        ],
+           [0.66666667, 1.        ]])
+    >>> piv
+    array([1, 1])
+
+    """
+
+    dpnp.check_supported_arrays_type(a)
+    assert_stacked_2d(a)
+
+    return dpnp_lu_factor(a, overwrite_a=overwrite_a, check_finite=check_finite)
+
+
 def matmul(x1, x2, /):
     """
     Computes the matrix product.
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 557258c04070..1b56e9359fcc 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -2307,7 +2307,7 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     # accommodate empty arrays
     if a.size == 0:
         lu = dpnp.empty_like(a)
-        piv = dpnp.arange(0, dtype=dpnp.int32)
+        piv = dpnp.arange(0, dtype=dpnp.int64)
         return lu, piv
 
     if check_finite:

From e781beb51d40d8c75d94b7091c3b0abfd0412fb3 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 12 Aug 2025 06:02:48 -0700
Subject: [PATCH 09/15] Add sycl_queue and usm_type tests

---
 dpnp/linalg/dpnp_utils_linalg.py | 13 +++++++------
 dpnp/tests/test_sycl_queue.py    | 12 ++++++++++++
 dpnp/tests/test_usm_type.py      | 12 ++++++++++++
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 1b56e9359fcc..bc8d7dc748b1 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -2303,11 +2303,15 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     """
 
     res_type = _common_type(a)
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
 
     # accommodate empty arrays
     if a.size == 0:
         lu = dpnp.empty_like(a)
-        piv = dpnp.arange(0, dtype=dpnp.int64)
+        piv = dpnp.arange(
+            0, dtype=dpnp.int64, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+        )
         return lu, piv
 
     if check_finite:
@@ -2317,12 +2321,7 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     if a.ndim > 2:
         raise NotImplementedError("Batched matrices are not supported")
 
-    m, n = a.shape
-
-    a_sycl_queue = a.sycl_queue
-    a_usm_type = a.usm_type
     _manager = dpu.SequentialOrderManager[a_sycl_queue]
-
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
     # SciPy-compatible behavior
@@ -2345,6 +2344,8 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
         a_h = a
         copy_ev = None
 
+    m, n = a.shape
+
     ipiv_h = dpnp.empty(
         min(m, n),
         dtype=dpnp.int64,
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 31dfb74f2cfa..7c08263e672c 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1570,6 +1570,18 @@ def test_lstsq(self, m, n, nrhs, device):
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
             assert_sycl_queue_equal(param_queue, b.sycl_queue)
 
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]]],
+    )
+    def test_lu_factor(self, data, device):
+        a = dpnp.array(data, device=device)
+        result = dpnp.linalg.lu_factor(a)
+
+        for param in result:
+            param_queue = param.sycl_queue
+            assert_sycl_queue_equal(param_queue, a.sycl_queue)
+
     @pytest.mark.parametrize("n", [-1, 0, 1, 2, 3])
     def test_matrix_power(self, n, device):
         x = dpnp.array([[1.0, 2.0], [3.0, 5.0]], device=device)
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index aed316eca533..6f886f8ec3c7 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -1449,6 +1449,18 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other):
                 [usm_type, usm_type_other]
             )
 
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]]],
+    )
+    def test_lu_factor(self, data, usm_type):
+        a = dpnp.array(data, usm_type=usm_type)
+        result = dpnp.linalg.lu_factor(a)
+
+        assert a.usm_type == usm_type
+        for param in result:
+            assert param.usm_type == a.usm_type
+
     @pytest.mark.parametrize("n", [-1, 0, 1, 2, 3])
     def test_matrix_power(self, n, usm_type):
         a = dpnp.array([[1, 2], [3, 5]], usm_type=usm_type)

From 506d24da0a4eebae1608e5e0463262a1f8f61cd1 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 13 Aug 2025 03:38:05 -0700
Subject: [PATCH 10/15] Add TestLuFactor

---
 dpnp/tests/test_linalg.py | 133 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 931c631b5633..dff60b6ca0b9 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -15,6 +15,7 @@
 )
 
 import dpnp
+import dpnp.linalg
 
 from .helper import (
     assert_dtype_allclose,
@@ -1868,6 +1869,138 @@ def test_lstsq_errors(self):
         assert_raises(TypeError, dpnp.linalg.lstsq, a_dp, b_dp, [-1])
 
 
+class TestLuFactor:
+    @staticmethod
+    def _apply_pivots_rows(A_dp, piv_dp):
+        m = A_dp.shape[0]
+        rows = dpnp.arange(m)
+        for i in range(int(piv_dp.shape[0])):
+            r = int(piv_dp[i].item())
+            if i != r:
+                tmp = rows[i].copy()
+                rows[i] = rows[r]
+                rows[r] = tmp
+        return A_dp[rows]
+
+    @staticmethod
+    def _split_lu(lu, m, n):
+        L = dpnp.tril(lu, k=-1)
+        dpnp.fill_diagonal(L, 1)
+        L = L[:, : min(m, n)]
+        U = dpnp.triu(lu)[: min(m, n), :]
+        return L, U
+
+    @pytest.mark.parametrize(
+        "shape", [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)]
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_factor(self, shape, order, dtype):
+        a_np = generate_random_numpy_array(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        lu, piv = dpnp.linalg.lu_factor(
+            a_dp, check_finite=False, overwrite_a=False
+        )
+
+        # verify piv
+        assert piv.shape == (min(shape),)
+        assert piv.dtype == dpnp.int64
+        if shape[0] > 0:
+            assert int(dpnp.min(piv)) >= 0
+            assert int(dpnp.max(piv)) < shape[0]
+
+        m, n = shape
+        L, U = self._split_lu(lu, m, n)
+        LU = L @ U
+
+        A_cast = a_dp.astype(LU.dtype, copy=False)
+        PA = self._apply_pivots_rows(A_cast, piv)
+
+        assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_overwrite_inplace(self, dtype):
+        a_dp = dpnp.array([[4, 3], [6, 3]], dtype=dtype, order="F")
+        a_dp_orig = a_dp.copy()
+        lu, piv = dpnp.linalg.lu_factor(
+            a_dp, overwrite_a=True, check_finite=False
+        )
+
+        assert lu is a_dp
+        assert lu.flags["F_CONTIGUOUS"] is True
+
+        L, U = self._split_lu(lu, 2, 2)
+        PA = self._apply_pivots_rows(a_dp_orig, piv)
+        LU = L @ U
+
+        assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_overwrite_copy(self, dtype):
+        a_dp = dpnp.array([[4, 3], [6, 3]], dtype=dtype, order="C")
+        a_dp_orig = a_dp.copy()
+        lu, piv = dpnp.linalg.lu_factor(
+            a_dp, overwrite_a=True, check_finite=False
+        )
+
+        assert lu is not a_dp
+        assert lu.flags["F_CONTIGUOUS"] is True
+
+        L, U = self._split_lu(lu, 2, 2)
+        PA = self._apply_pivots_rows(a_dp_orig, piv)
+        LU = L @ U
+
+        assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_inputs(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        lu, piv = dpnp.linalg.lu_factor(a_dp, check_finite=False)
+        assert lu.shape == shape
+        assert piv.shape == (min(shape),)
+
+    @pytest.mark.parametrize(
+        "sl",
+        [
+            (slice(None, None, 2), slice(None, None, 2)),
+            (slice(None, None, -1), slice(None, None, -1)),
+        ],
+    )
+    def test_strided(self, sl):
+        base = (
+            numpy.arange(7 * 7, dtype=dpnp.default_float_type()).reshape(
+                7, 7, order="F"
+            )
+            + 0.1
+        )
+        a_np = base[sl]
+        a_dp = dpnp.array(a_np)
+
+        lu, piv = dpnp.linalg.lu_factor(a_dp, check_finite=False)
+        L, U = self._split_lu(lu, *a_dp.shape)
+        PA = self._apply_pivots_rows(a_dp, piv)
+        LU = L @ U
+
+        assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
+
+    def test_singular_matrix(self):
+        a_dp = dpnp.array([[1.0, 2.0], [2.0, 4.0]])
+        with pytest.warns(RuntimeWarning, match="Singular matrix"):
+            dpnp.linalg.lu_factor(a_dp, check_finite=False)
+
+    @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
+    def test_check_finite_raises(self, bad):
+        a_dp = dpnp.array([[1.0, 2.0], [3.0, bad]], order="F")
+        assert_raises(
+            ValueError, dpnp.linalg.lu_factor, a_dp, check_finite=True
+        )
+
+    def test_batched_not_supported(self):
+        a_dp = dpnp.ones((2, 2, 2))
+        assert_raises(NotImplementedError, dpnp.linalg.lu_factor, a_dp)
+
+
 class TestMatrixPower:
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     @pytest.mark.parametrize(

From 08f5ca343f80844cdcb37e7618d7a140313de315 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 13 Aug 2025 05:54:55 -0700
Subject: [PATCH 11/15] Add test_overwrite_copy_special to improve coverage

---
 dpnp/tests/test_linalg.py | 75 +++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index f8c6026f070a..00b280483b4b 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -279,12 +279,15 @@ def test_cholesky_errors(self):
 
 
 class TestCond:
-    _norms = [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+    def setup_method(self):
+        numpy.random.seed(70)
 
     @pytest.mark.parametrize(
-        "shape", [(0, 4, 4), (4, 0, 3, 3)], ids=["(0, 4, 4)", "(4, 0, 3, 3)"]
+        "shape", [(0, 4, 4), (4, 0, 3, 3)], ids=["(0, 5, 3)", "(4, 0, 2, 3)"]
+    )
+    @pytest.mark.parametrize(
+        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
     )
-    @pytest.mark.parametrize("p", _norms)
     def test_empty(self, shape, p):
         a = numpy.empty(shape)
         ia = dpnp.array(a)
@@ -293,27 +296,26 @@ def test_empty(self, shape, p):
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected)
 
-    # TODO: uncomment once numpy 2.3.3 release is published
-    # @testing.with_requires("numpy>=2.3.3")
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_none=True, no_bool=True)
     )
     @pytest.mark.parametrize(
         "shape", [(4, 4), (2, 4, 3, 3)], ids=["(4, 4)", "(2, 4, 3, 3)"]
     )
-    @pytest.mark.parametrize("p", _norms)
+    @pytest.mark.parametrize(
+        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+    )
     def test_basic(self, dtype, shape, p):
         a = generate_random_numpy_array(shape, dtype)
         ia = dpnp.array(a)
 
         result = dpnp.linalg.cond(ia, p=p)
         expected = numpy.linalg.cond(a, p=p)
-        # TODO: remove when numpy#29333 is released
-        if numpy_version() < "2.3.3":
-            expected = expected.real
         assert_dtype_allclose(result, expected, factor=16)
 
-    @pytest.mark.parametrize("p", _norms)
+    @pytest.mark.parametrize(
+        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+    )
     def test_bool(self, p):
         a = numpy.array([[True, True], [True, False]])
         ia = dpnp.array(a)
@@ -322,7 +324,9 @@ def test_bool(self, p):
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize("p", _norms)
+    @pytest.mark.parametrize(
+        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+    )
     def test_nan_to_inf(self, p):
         a = numpy.zeros((2, 2))
         ia = dpnp.array(a)
@@ -340,7 +344,9 @@ def test_nan_to_inf(self, p):
         else:
             assert_raises(dpnp.linalg.LinAlgError, dpnp.linalg.cond, ia, p=p)
 
-    @pytest.mark.parametrize("p", _norms)
+    @pytest.mark.parametrize(
+        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+    )
     @pytest.mark.parametrize(
         "stride",
         [(-2, -3, 2, -2), (-2, 4, -4, -4), (2, 3, 4, 4), (-1, 3, 3, -3)],
@@ -352,23 +358,21 @@ def test_nan_to_inf(self, p):
         ],
     )
     def test_strided(self, p, stride):
-        A = generate_random_numpy_array(
-            (6, 8, 10, 10), seed_value=70, low=0, high=1
-        )
-        iA = dpnp.array(A)
+        A = numpy.random.rand(6, 8, 10, 10)
+        B = dpnp.asarray(A)
         slices = tuple(slice(None, None, stride[i]) for i in range(A.ndim))
-        a, ia = A[slices], iA[slices]
+        a = A[slices]
+        b = B[slices]
 
-        result = dpnp.linalg.cond(ia, p=p)
+        result = dpnp.linalg.cond(b, p=p)
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected, factor=24)
 
-    @pytest.mark.parametrize("xp", [dpnp, numpy])
-    def test_error(self, xp):
+    def test_error(self):
         # cond is not defined on empty arrays
-        a = xp.empty((2, 0))
+        ia = dpnp.empty((2, 0))
         with pytest.raises(ValueError):
-            xp.linalg.cond(a, p=1)
+            dpnp.linalg.cond(ia, p=1)
 
 
 class TestDet:
@@ -1949,6 +1953,33 @@ def test_overwrite_copy(self, dtype):
 
         assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
 
+    def test_overwrite_copy_special(self):
+        # F-contig but dtype != res_type
+        a1 = dpnp.array([[4, 3], [6, 3]], dtype=dpnp.int32, order="F")
+        a1_orig = a1.copy()
+
+        # F-contig, match dtype but read-only input
+        a2 = dpnp.array(
+            [[4, 3], [6, 3]], dtype=dpnp.default_float_type(), order="F"
+        )
+        a2_orig = a2.copy()
+        a2.flags["WRITABLE"] = False
+
+        for a_dp, a_orig in zip((a1, a1), (a1_orig, a2_orig)):
+            lu, piv = dpnp.linalg.lu_factor(
+                a_dp, overwrite_a=True, check_finite=False
+            )
+
+            assert lu is not a_dp
+            assert lu.flags["F_CONTIGUOUS"] is True
+
+            L, U = self._split_lu(lu, 2, 2)
+            PA = self._apply_pivots_rows(
+                a_orig.astype(L.dtype, copy=False), piv
+            )
+            LU = L @ U
+            assert_allclose(LU, PA, rtol=1e-6, atol=1e-6)
+
     @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
     def test_empty_inputs(self, shape):
         a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")

From c2f297161581208f77fb7b0b14261703164ef525 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 13 Aug 2025 07:50:00 -0700
Subject: [PATCH 12/15] Align test_linalg.py with master

---
 dpnp/tests/test_linalg.py | 49 ++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 00b280483b4b..fd87e0ffd51f 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -15,7 +15,6 @@
 )
 
 import dpnp
-import dpnp.linalg
 
 from .helper import (
     assert_dtype_allclose,
@@ -279,15 +278,12 @@ def test_cholesky_errors(self):
 
 
 class TestCond:
-    def setup_method(self):
-        numpy.random.seed(70)
+    _norms = [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
 
     @pytest.mark.parametrize(
-        "shape", [(0, 4, 4), (4, 0, 3, 3)], ids=["(0, 5, 3)", "(4, 0, 2, 3)"]
-    )
-    @pytest.mark.parametrize(
-        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
+        "shape", [(0, 4, 4), (4, 0, 3, 3)], ids=["(0, 4, 4)", "(4, 0, 3, 3)"]
     )
+    @pytest.mark.parametrize("p", _norms)
     def test_empty(self, shape, p):
         a = numpy.empty(shape)
         ia = dpnp.array(a)
@@ -296,26 +292,27 @@ def test_empty(self, shape, p):
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected)
 
+    # TODO: uncomment once numpy 2.3.3 release is published
+    # @testing.with_requires("numpy>=2.3.3")
     @pytest.mark.parametrize(
         "dtype", get_all_dtypes(no_none=True, no_bool=True)
     )
     @pytest.mark.parametrize(
         "shape", [(4, 4), (2, 4, 3, 3)], ids=["(4, 4)", "(2, 4, 3, 3)"]
     )
-    @pytest.mark.parametrize(
-        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
-    )
+    @pytest.mark.parametrize("p", _norms)
     def test_basic(self, dtype, shape, p):
         a = generate_random_numpy_array(shape, dtype)
         ia = dpnp.array(a)
 
         result = dpnp.linalg.cond(ia, p=p)
         expected = numpy.linalg.cond(a, p=p)
+        # TODO: remove when numpy#29333 is released
+        if numpy_version() < "2.3.3":
+            expected = expected.real
         assert_dtype_allclose(result, expected, factor=16)
 
-    @pytest.mark.parametrize(
-        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
-    )
+    @pytest.mark.parametrize("p", _norms)
     def test_bool(self, p):
         a = numpy.array([[True, True], [True, False]])
         ia = dpnp.array(a)
@@ -324,9 +321,7 @@ def test_bool(self, p):
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize(
-        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
-    )
+    @pytest.mark.parametrize("p", _norms)
     def test_nan_to_inf(self, p):
         a = numpy.zeros((2, 2))
         ia = dpnp.array(a)
@@ -344,9 +339,7 @@ def test_nan_to_inf(self, p):
         else:
             assert_raises(dpnp.linalg.LinAlgError, dpnp.linalg.cond, ia, p=p)
 
-    @pytest.mark.parametrize(
-        "p", [None, -dpnp.inf, -2, -1, 1, 2, dpnp.inf, "fro"]
-    )
+    @pytest.mark.parametrize("p", _norms)
     @pytest.mark.parametrize(
         "stride",
         [(-2, -3, 2, -2), (-2, 4, -4, -4), (2, 3, 4, 4), (-1, 3, 3, -3)],
@@ -358,21 +351,23 @@ def test_nan_to_inf(self, p):
         ],
     )
     def test_strided(self, p, stride):
-        A = numpy.random.rand(6, 8, 10, 10)
-        B = dpnp.asarray(A)
+        A = generate_random_numpy_array(
+            (6, 8, 10, 10), seed_value=70, low=0, high=1
+        )
+        iA = dpnp.array(A)
         slices = tuple(slice(None, None, stride[i]) for i in range(A.ndim))
-        a = A[slices]
-        b = B[slices]
+        a, ia = A[slices], iA[slices]
 
-        result = dpnp.linalg.cond(b, p=p)
+        result = dpnp.linalg.cond(ia, p=p)
         expected = numpy.linalg.cond(a, p=p)
         assert_dtype_allclose(result, expected, factor=24)
 
-    def test_error(self):
+    @pytest.mark.parametrize("xp", [dpnp, numpy])
+    def test_error(self, xp):
         # cond is not defined on empty arrays
-        ia = dpnp.empty((2, 0))
+        a = xp.empty((2, 0))
         with pytest.raises(ValueError):
-            dpnp.linalg.cond(ia, p=1)
+            xp.linalg.cond(a, p=1)
 
 
 class TestDet:

From 74f6c6351e60b5befcfc11d79a397339c5943510 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 19 Aug 2025 02:01:52 -0700
Subject: [PATCH 13/15] Apply comment

---
 dpnp/backend/extensions/lapack/getrf.cpp       | 3 +--
 dpnp/backend/extensions/lapack/getrf_batch.cpp | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp
index 66b91a4121fa..286797cb17e7 100644
--- a/dpnp/backend/extensions/lapack/getrf.cpp
+++ b/dpnp/backend/extensions/lapack/getrf.cpp
@@ -196,8 +196,7 @@ std::pair<sycl::event, sycl::event>
     bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous();
     if (!is_a_array_c_contig && !is_a_array_f_contig) {
         throw py::value_error("The input array "
-                              "must be must be either C-contiguous "
-                              "or F-contiguous");
+                              "must be contiguous");
     }
     if (!is_ipiv_array_c_contig) {
         throw py::value_error("The array of pivot indices "
diff --git a/dpnp/backend/extensions/lapack/getrf_batch.cpp b/dpnp/backend/extensions/lapack/getrf_batch.cpp
index 6ac61045653b..ec87c8b1f2ae 100644
--- a/dpnp/backend/extensions/lapack/getrf_batch.cpp
+++ b/dpnp/backend/extensions/lapack/getrf_batch.cpp
@@ -225,8 +225,7 @@ std::pair<sycl::event, sycl::event>
     bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous();
     if (!is_a_array_c_contig && !is_a_array_f_contig) {
         throw py::value_error("The input array "
-                              "must be must be either C-contiguous "
-                              "or F-contiguous");
+                              "must be must contiguous");
     }
     if (!is_ipiv_array_c_contig) {
         throw py::value_error("The array of pivot indices "

From 3276f92bd6caf11edb18a38672250b55fc03609d Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 22 Aug 2025 06:09:18 -0700
Subject: [PATCH 14/15] Apply remarks

---
 dpnp/backend/extensions/lapack/getrf.cpp     | 6 +++---
 dpnp/backend/extensions/lapack/lapack_py.cpp | 2 +-
 dpnp/tests/test_linalg.py                    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpnp/backend/extensions/lapack/getrf.cpp b/dpnp/backend/extensions/lapack/getrf.cpp
index 286797cb17e7..81231b5055af 100644
--- a/dpnp/backend/extensions/lapack/getrf.cpp
+++ b/dpnp/backend/extensions/lapack/getrf.cpp
@@ -71,7 +71,7 @@ static sycl::event getrf_impl(sycl::queue &exec_q,
     T *a = reinterpret_cast<T *>(in_a);
 
     const std::int64_t scratchpad_size =
-        mkl_lapack::getrf_scratchpad_size<T>(exec_q, n, n, lda);
+        mkl_lapack::getrf_scratchpad_size<T>(exec_q, m, n, lda);
     T *scratchpad = nullptr;
 
     std::stringstream error_msg;
@@ -88,9 +88,9 @@ static sycl::event getrf_impl(sycl::queue &exec_q,
                   // It must be a non-negative integer.
             n,    // The number of columns in the input matrix A (0 ≤ n).
                   // It must be a non-negative integer.
-            a,    // Pointer to the input matrix A (n x n).
+            a,    // Pointer to the input matrix A (m x n).
             lda,  // The leading dimension of matrix A.
-                  // It must be at least max(1, n).
+                  // It must be at least max(1, m).
             ipiv, // Pointer to the output array of pivot indices.
             scratchpad, // Pointer to scratchpad memory to be used by MKL
                         // routine for storing intermediate results.
diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp
index 4d5adfe09e4a..fb4dce4643b7 100644
--- a/dpnp/backend/extensions/lapack/lapack_py.cpp
+++ b/dpnp/backend/extensions/lapack/lapack_py.cpp
@@ -135,7 +135,7 @@ PYBIND11_MODULE(_lapack_impl, m)
 
     m.def("_getrf", &lapack_ext::getrf,
           "Call `getrf` from OneMKL LAPACK library to return "
-          "the LU factorization of a general n x n matrix",
+          "the LU factorization of a general m x n matrix",
           py::arg("sycl_queue"), py::arg("a_array"), py::arg("ipiv_array"),
           py::arg("dev_info"), py::arg("depends") = py::list());
 
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 7aa5942bab46..d4c2d540f7c4 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -1955,7 +1955,7 @@ def test_overwrite_copy_special(self):
         a2_orig = a2.copy()
         a2.flags["WRITABLE"] = False
 
-        for a_dp, a_orig in zip((a1, a1), (a1_orig, a2_orig)):
+        for a_dp, a_orig in zip((a1, a2), (a1_orig, a2_orig)):
             lu, piv = dpnp.linalg.lu_factor(
                 a_dp, overwrite_a=True, check_finite=False
             )

From 4cf630df55f7e22af66aa8fb777c67b032579e56 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 22 Aug 2025 06:24:41 -0700
Subject: [PATCH 15/15] Simplify _apply_pivots_rows

---
 dpnp/tests/test_linalg.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index d4c2d540f7c4..dd5daa99b74d 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -1863,13 +1863,17 @@ class TestLuFactor:
     @staticmethod
     def _apply_pivots_rows(A_dp, piv_dp):
         m = A_dp.shape[0]
-        rows = dpnp.arange(m)
-        for i in range(int(piv_dp.shape[0])):
-            r = int(piv_dp[i].item())
+
+        if m == 0 or piv_dp.size == 0:
+            return A_dp
+
+        rows = list(range(m))
+        piv_np = dpnp.asnumpy(piv_dp)
+        for i, r in enumerate(piv_np):
             if i != r:
-                tmp = rows[i].copy()
-                rows[i] = rows[r]
-                rows[r] = tmp
+                rows[i], rows[r] = rows[r], rows[i]
+
+        rows = dpnp.asarray(rows)
         return A_dp[rows]
 
     @staticmethod