diff --git a/GRiDCodeGenerator.py b/GRiDCodeGenerator.py
index 485bdf3..b265306 100644
--- a/GRiDCodeGenerator.py
+++ b/GRiDCodeGenerator.py
@@ -12,7 +12,7 @@ class GRiDCodeGenerator:
         gen_XImats_helpers_temp_shared_memory_code, gen_load_update_XImats_helpers, gen_topology_helpers_size, \
         gen_get_Xhom_size, gen_load_update_XmatsHom_helpers, gen_load_update_XmatsHom_helpers_function_call, gen_XmatsHom_helpers_temp_shared_memory_code, \
         gen_topology_sparsity_helpers_python, gen_init_topology_helpers, gen_topology_helpers_pointers_for_cpp, \
-        gen_insert_helpers_function_call, gen_insert_helpers_func_def_params, gen_init_robotModel, \
+        gen_insert_helpers_function_call, gen_insert_helpers_func_def_params, gen_init_robotModel, gen_joint_limits_size, gen_init_joint_limits, \
         gen_invert_matrix, gen_matmul, gen_matmul_trans, gen_crm_mul, gen_crm, gen_outer_product
     # then import all of the algorithms
@@ -35,7 +35,7 @@ class GRiDCodeGenerator:
         gen_end_effector_pose_host, gen_end_effector_pose_gradient_inner_temp_mem_size, gen_end_effector_pose_gradient_inner_function_call, \
         gen_end_effector_pose_gradient_inner, gen_end_effector_pose_gradient_device, gen_end_effector_pose_gradient_kernel, \
         gen_end_effector_pose_gradient_host, gen_end_effector_pose_gradient_hessian_inner_temp_mem_size, gen_end_effector_pose_gradient_hessian_inner_function_call, \
-        gen_end_effector_pose_gradient_hessian_inner, gen_end_effector_pose_gradient_hessian_device, gen_end_effector_pose_gradient_hessian_kernel, \
+        gen_end_effector_pose_gradient_hessian_inner, gen_end_effector_pose_gradient_hessian_device, gen_end_effector_pose_gradient_hessian_kernel, gen_X_single_thread, gen_X_warp, \
         gen_end_effector_pose_gradient_hessian_host, gen_eepose_and_derivatives, \
         gen_aba, gen_aba_inner, gen_aba_host, \
         gen_aba_inner_function_call, gen_aba_kernel, gen_aba_device, gen_aba_inner_temp_mem_size, \
@@ -401,6 +401,8 @@ def gen_all_code(self, use_thread_group = False, include_base_inertia = False, i
     self.gen_init_XImats(include_base_inertia,include_homogenous_transforms)
     self.gen_init_robotModel()
     self.gen_init_gridData()
+    self.gen_joint_limits_size()
+    self.gen_init_joint_limits()
     self.gen_load_update_XImats_helpers(use_thread_group)
     if not self.robot.floating_base:
         if include_homogenous_transforms:
diff --git a/algorithms/_eepose_gradient_hessian.py b/algorithms/_eepose_gradient_hessian.py
index 649db4d..a05e7e5 100644
--- a/algorithms/_eepose_gradient_hessian.py
+++ b/algorithms/_eepose_gradient_hessian.py
@@ -1198,6 +1198,180 @@ def gen_end_effector_pose_gradient_hessian_host(self, mode = 0):
     self.gen_add_code_line("printf(\"Single Call DEEPOS %fus\\n\",time_delta_us_timespec(start,end)/static_cast<double>(num_timesteps));")
     self.gen_add_end_function()
 
+def gen_X_single_thread(self):
+    n = self.robot.get_num_pos()
+    Xmats_hom = self.robot.get_Xmats_hom_ordered_by_id()
+
+    func_params = [
+        "s_jointXforms is the pointer to the cumulative joint transformation matrices",
+        "s_XmatsHom is the pointer to the homogeneous transformation matrices",
+        "s_q is the vector of joint positions",
+        "tid is the index of the last joint to compute (inclusive)"
+    ]
+    self.gen_add_func_doc("Single-thread joint transformation matrix accumulation up to joint (tid)",
+                          [], func_params, None)
+    self.gen_add_code_line("template <typename T>")
+    self.gen_add_code_line("__device__")
+    self.gen_add_code_line("void X_single_thread(T *s_jointXforms, T *s_XmatsHom, T *s_q, int tid) {", True)
+
+    # Refresh the joint-angle-dependent (trig) entries of s_XmatsHom from s_q;
+    # constant entries are assumed to already be loaded
+    self.gen_add_code_line("// Update s_XmatsHom from s_q")
+    for jid in range(n):
+        self.gen_add_code_line("// X_hom[" + str(jid) + "]")
+        for col in range(4):
+            for row in range(4):
+                val = Xmats_hom[jid][row, col]
+                if not val.is_constant():
+                    s = str(val)
+                    s = s.replace("sin(theta)", f"sin(s_q[{jid}])")
+                    s = s.replace("cos(theta)", f"cos(s_q[{jid}])")
+                    s = s.replace("theta", f"s_q[{jid}]")
+                    cpp_ind = str(self.gen_static_array_ind_3d(jid, col, row, ind_stride=16, col_stride=4))
+                    self.gen_add_code_line(f"s_XmatsHom[{cpp_ind}] = static_cast<T>({s});")
+
+    # Clamp tid, then accumulate the global transforms up to it
+    self.gen_add_code_line("// Accumulate global transforms up to 'tid'")
+    self.gen_add_code_line(f"if (tid >= {n}) tid = {n}-1;")
+    self.gen_add_code_line("if (tid < 0) { return; }")
+
+    # j = 0: copy local -> global (zero row 3 so the output matches X_warp)
+    self.gen_add_code_line("{")
+    self.gen_add_code_line("    const T* c0 = &s_XmatsHom[0];")
+    self.gen_add_code_line("    T* o0 = &s_jointXforms[0];")
+    self.gen_add_code_line("    o0[0]=c0[0];   o0[1]=c0[1];   o0[2]=c0[2];")
+    self.gen_add_code_line("    o0[4]=c0[4];   o0[5]=c0[5];   o0[6]=c0[6];")
+    self.gen_add_code_line("    o0[8]=c0[8];   o0[9]=c0[9];   o0[10]=c0[10];")
+    self.gen_add_code_line("    o0[12]=c0[12]; o0[13]=c0[13]; o0[14]=c0[14];")
+    self.gen_add_code_line("    o0[3]=(T)0; o0[7]=(T)0; o0[11]=(T)0; o0[15]=(T)1;")
+    self.gen_add_code_line("}")
+    self.gen_add_code_line("if (tid == 0) { return; }")
+
+    # j >= 1: global[j] = global[j-1] * local[j] (rows 0-2 only; row 3 is constant)
+    self.gen_add_code_line("#pragma unroll")
+    self.gen_add_code_line("for (int j = 1; j <= tid; ++j) {")
+    self.gen_add_code_line("    const T* p = &s_jointXforms[(j-1)*16];")
+    self.gen_add_code_line("    const T* c = &s_XmatsHom[j*16];")
+    self.gen_add_code_line("    T r0  = p[0]*c[0]  + p[4]*c[1]  + p[8]*c[2];")
+    self.gen_add_code_line("    T r1  = p[1]*c[0]  + p[5]*c[1]  + p[9]*c[2];")
+    self.gen_add_code_line("    T r2  = p[2]*c[0]  + p[6]*c[1]  + p[10]*c[2];")
+    self.gen_add_code_line("    T r4  = p[0]*c[4]  + p[4]*c[5]  + p[8]*c[6];")
+    self.gen_add_code_line("    T r5  = p[1]*c[4]  + p[5]*c[5]  + p[9]*c[6];")
+    self.gen_add_code_line("    T r6  = p[2]*c[4]  + p[6]*c[5]  + p[10]*c[6];")
+    self.gen_add_code_line("    T r8  = p[0]*c[8]  + p[4]*c[9]  + p[8]*c[10];")
+    self.gen_add_code_line("    T r9  = p[1]*c[8]  + p[5]*c[9]  + p[9]*c[10];")
+    self.gen_add_code_line("    T r10 = p[2]*c[8]  + p[6]*c[9]  + p[10]*c[10];")
+    self.gen_add_code_line("    T r12 = p[0]*c[12] + p[4]*c[13] + p[8]*c[14]  + p[12];")
+    self.gen_add_code_line("    T r13 = p[1]*c[12] + p[5]*c[13] + p[9]*c[14]  + p[13];")
+    self.gen_add_code_line("    T r14 = p[2]*c[12] + p[6]*c[13] + p[10]*c[14] + p[14];")
+    self.gen_add_code_line("    T* o = &s_jointXforms[j*16];")
+    self.gen_add_code_line("    o[0]=r0;   o[1]=r1;   o[2]=r2;")
+    self.gen_add_code_line("    o[4]=r4;   o[5]=r5;   o[6]=r6;")
+    self.gen_add_code_line("    o[8]=r8;   o[9]=r9;   o[10]=r10;")
+    self.gen_add_code_line("    o[12]=r12; o[13]=r13; o[14]=r14;")
+    self.gen_add_code_line("    o[3]=(T)0; o[7]=(T)0; o[11]=(T)0; o[15]=(T)1;")
+    self.gen_add_code_line("}")
+
+    self.gen_add_end_function()
+
+def gen_X_warp(self):
+    n = self.robot.get_num_pos()
+    Xmats_hom = self.robot.get_Xmats_hom_ordered_by_id()
+
+    func_params = [
+        "s_jointXforms is the pointer to the cumulative joint transformation matrices",
+        "s_XmatsHom is the pointer to the homogeneous transformation matrices",
+        "s_q is the vector of joint positions",
+        "tid is the index of the last joint to compute (inclusive)"
+    ]
+    self.gen_add_func_doc("Warp-cooperative joint transformation matrix accumulation up to joint (tid)",
+                          [], func_params, None)
+
+    self.gen_add_code_line("template <typename T>")
+    self.gen_add_code_line("__device__ inline void X_warp(")
+    self.gen_add_code_line("    T* __restrict__ s_jointXforms,")
s_XmatsHom,") + self.gen_add_code_line(" const T* __restrict__ s_q,") + self.gen_add_code_line(" int tid)") + self.gen_add_code_line("{", True) + self.gen_add_code_line(" const int lane = threadIdx.x & 31;") + self.gen_add_code_line(" const unsigned mask = 0xFFFFFFFFu;") + self.gen_add_code_line(f" if (tid >= {n}) tid = {n}-1;") + self.gen_add_code_line(" if (tid < 0) return;") + + # per-lane trig write + self.gen_add_code_line("") + self.gen_add_code_line(f" if (lane <= {n-1}) {{") + self.gen_add_code_line(" const int j = lane;") + self.gen_add_code_line(" const T c = static_cast(cos(s_q[j]));") + self.gen_add_code_line(" const T s = static_cast(sin(s_q[j]));") + self.gen_add_code_line(" T* X = &s_XmatsHom[j * 16];") + + # Update theta + for jid in range(n): + prefix = "if" if jid == 0 else "else if" + self.gen_add_code_line(f" {prefix} (j == {jid}) {{") + # (row,col) -> index within 4x4 block + wrote_any = False + for col in range(4): + for row in range(4): + val = Xmats_hom[jid][row, col] + s = str(val) + depends = ("theta" in s) or ("sin(" in s) or ("cos(" in s) + if not depends: + continue + wrote_any = True + # lane-local cos/sin + s = s.replace("cos(theta)", "c") + s = s.replace("sin(theta)", "s") + s = s.replace("theta", "s_q[j]") + idx = row + col*4 + self.gen_add_code_line(f" X[{idx}] = static_cast({s});") + self.gen_add_code_line(" }") + self.gen_add_code_line(" }") + self.gen_add_code_line(" __syncwarp(mask);") + + # row per-lane + self.gen_add_code_line("") + self.gen_add_code_line(" {") + self.gen_add_code_line(" const T* c = &s_XmatsHom[0];") + self.gen_add_code_line(" T* o = &s_jointXforms[0];") + self.gen_add_code_line(" if (lane == 0) { o[0] = c[0]; o[4] = c[4]; o[8] = c[8]; o[12] = c[12]; }") + self.gen_add_code_line(" else if (lane == 1) { o[1] = c[1]; o[5] = c[5]; o[9] = c[9]; o[13] = c[13]; }") + self.gen_add_code_line(" else if (lane == 2) { o[2] = c[2]; o[6] = c[6]; o[10] = c[10]; o[14] = c[14]; }") + self.gen_add_code_line(" if (lane == 0) { o[3]=(T)0; o[7]=(T)0; o[11]=(T)0; o[15]=(T)1; }") + self.gen_add_code_line(" }") + self.gen_add_code_line(" __syncwarp(mask);") + self.gen_add_code_line(" if (tid == 0) return;") + + # accumulate transformations + self.gen_add_code_line("") + self.gen_add_code_line(" #pragma unroll") + self.gen_add_code_line(" for (int j = 1; j <= tid; ++j) {") + self.gen_add_code_line(" const T* p = &s_jointXforms[(j - 1) * 16];") + self.gen_add_code_line(" const T* c = &s_XmatsHom[ j * 16];") + self.gen_add_code_line(" T* o = &s_jointXforms[ j * 16];") + self.gen_add_code_line(" if (lane == 0) {") + self.gen_add_code_line(" o[0] = p[0]*c[0] + p[4]*c[1] + p[8]*c[2];") + self.gen_add_code_line(" o[4] = p[0]*c[4] + p[4]*c[5] + p[8]*c[6];") + self.gen_add_code_line(" o[8] = p[0]*c[8] + p[4]*c[9] + p[8]*c[10];") + self.gen_add_code_line(" o[12] = p[0]*c[12] + p[4]*c[13] + p[8]*c[14] + p[12];") + self.gen_add_code_line(" } else if (lane == 1) {") + self.gen_add_code_line(" o[1] = p[1]*c[0] + p[5]*c[1] + p[9]*c[2];") + self.gen_add_code_line(" o[5] = p[1]*c[4] + p[5]*c[5] + p[9]*c[6];") + self.gen_add_code_line(" o[9] = p[1]*c[8] + p[5]*c[9] + p[9]*c[10];") + self.gen_add_code_line(" o[13] = p[1]*c[12] + p[5]*c[13] + p[9]*c[14] + p[13];") + self.gen_add_code_line(" } else if (lane == 2) {") + self.gen_add_code_line(" o[2] = p[2]*c[0] + p[6]*c[1] + p[10]*c[2];") + self.gen_add_code_line(" o[6] = p[2]*c[4] + p[6]*c[5] + p[10]*c[6];") + self.gen_add_code_line(" o[10] = p[2]*c[8] + p[6]*c[9] + p[10]*c[10];") + self.gen_add_code_line(" o[14] = p[2]*c[12] 
+ p[6]*c[13] + p[10]*c[14] + p[14];") + self.gen_add_code_line(" }") + self.gen_add_code_line(" if (lane == 0) { o[3]=(T)0; o[7]=(T)0; o[11]=(T)0; o[15]=(T)1; }") + self.gen_add_code_line(" __syncwarp(mask);") + self.gen_add_code_line(" }") + + self.gen_add_end_function() + def gen_eepose_and_derivatives(self, use_thread_group = False): # first generate the inner helpers self.gen_end_effector_pose_inner(use_thread_group) @@ -1233,4 +1407,7 @@ def gen_eepose_and_derivatives(self, use_thread_group = False): # then the host launch wrappers self.gen_end_effector_pose_gradient_hessian_host(0) self.gen_end_effector_pose_gradient_hessian_host(1) - self.gen_end_effector_pose_gradient_hessian_host(2) \ No newline at end of file + self.gen_end_effector_pose_gradient_hessian_host(2) + + self.gen_X_single_thread() + self.gen_X_warp() \ No newline at end of file diff --git a/helpers/_topology_helpers.py b/helpers/_topology_helpers.py index bd25f75..9050932 100644 --- a/helpers/_topology_helpers.py +++ b/helpers/_topology_helpers.py @@ -711,4 +711,53 @@ def gen_init_robotModel(self): self.gen_add_code_lines(["robotModel *d_robotModel; gpuErrchk(cudaMalloc((void**)&d_robotModel,sizeof(robotModel)));", "gpuErrchk(cudaMemcpy(d_robotModel,&h_robotModel,sizeof(robotModel),cudaMemcpyHostToDevice));"]) self.gen_add_code_line("return d_robotModel;") - self.gen_add_end_function() \ No newline at end of file + self.gen_add_end_function() + +def gen_joint_limits_size(self): + n = self.robot.get_num_pos() + return 2 * n + +def gen_init_joint_limits(self): + n = self.robot.get_num_pos() + + limits_q_order = [] + for j in self.robot.get_joints_ordered_by_id(): + jt = j.get_type() if hasattr(j, "get_type") else j.jtype + if jt == "revolute": + lims = j.get_joint_limits() if hasattr(j, "get_joint_limits") else getattr(j, "joint_limits", []) + if lims and len(lims) >= 2: + lo, hi = lims[0], lims[1] + else: + lo, hi = -float("inf"), float("inf") + if lo is None: lo = -float("inf") + if hi is None: hi = float("inf") + limits_q_order.append((lo, hi)) + + self.gen_add_func_doc( + "Initializes joint limits (lower/upper) in GPU memory", + ["Memory order is lower[0..n-1], upper[0..n-1]"], + [], + "A device pointer to the joint limits array" + ) + self.gen_add_code_line("template ") + self.gen_add_code_line("__host__") + self.gen_add_code_line("T* init_joint_limits() {", True) + + total_size = self.gen_joint_limits_size() + self.gen_add_code_line(f"T *h_joint_limits = (T*)malloc({total_size}*sizeof(T));") + + for i, (lo, hi) in enumerate(limits_q_order): + lo_str = ("-std::numeric_limits::infinity()" if (lo is None or lo == -float('inf')) + else f"static_cast({lo})") + hi_str = (" std::numeric_limits::infinity()" if (hi is None or hi == float('inf')) + else f"static_cast({hi})") + self.gen_add_code_line(f"h_joint_limits[{i}] = {lo_str};") + self.gen_add_code_line(f"h_joint_limits[{i + n}] = {hi_str};") + + self.gen_add_code_line("T *d_joint_limits;") + self.gen_add_code_line(f"gpuErrchk(cudaMalloc((void**)&d_joint_limits, {total_size}*sizeof(T)));") + self.gen_add_code_line(f"gpuErrchk(cudaMemcpy(d_joint_limits, h_joint_limits, {total_size}*sizeof(T), cudaMemcpyHostToDevice));") + self.gen_add_code_line("free(h_joint_limits);") + self.gen_add_code_line("return d_joint_limits;") + self.gen_add_end_function() +