From b7adea4698ca8f8d91e501169674b8100f4528ea Mon Sep 17 00:00:00 2001
From: "sewon.jeon" <irocks0922@gmail.com>
Date: Sun, 8 Feb 2026 17:58:11 +0900
Subject: [PATCH 1/2] Refactor NSS and DNSS implementations for improved
 readability and modularity

- Rewrote legacy code into clean C++17 style.
- Added missing method implementations and input validation.
- Introduced options struct for configurable parameters in NSS and DNSS.
- Implemented CUDA support for DNSS rotational feature computation.
- Added new CMake configuration for building with or without CUDA.
- Updated README to reflect changes and provide build instructions.
- Added .gitignore to exclude build directories.
---
 .gitignore     |   2 +
 CMakeLists.txt |  21 ++
 NSS.cpp        | 855 +++++++++++++++++++++++++++++++------------------
 NSS.h          | 285 ++++++++++-------
 README.md      |  60 +++-
 dnss_cuda.cu   | 217 +++++++++++++
 6 files changed, 997 insertions(+), 443 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 dnss_cuda.cu
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3b912ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+build-cuda/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c0dd088
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(DNSS LANGUAGES CXX)
+
+option(DNSS_ENABLE_CUDA "Enable CUDA acceleration for DNSS rotational feature computation" OFF)
+
+add_library(dnss NSS.cpp NSS.h)
+target_include_directories(dnss PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_compile_features(dnss PUBLIC cxx_std_17)
+
+if(DNSS_ENABLE_CUDA)
+	enable_language(CUDA)
+	find_package(CUDAToolkit REQUIRED)
+	if(NOT CMAKE_CUDA_ARCHITECTURES)
+		set(CMAKE_CUDA_ARCHITECTURES native)
+	endif()
+	target_sources(dnss PRIVATE dnss_cuda.cu)
+	target_compile_definitions(dnss PRIVATE DNSS_HAS_CUDA=1)
+	target_link_libraries(dnss PUBLIC CUDA::cudart)
+	set_target_properties(dnss PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+endif()
diff --git a/NSS.cpp b/NSS.cpp
index 84e3e20..5d4e4ae 100644
--- a/NSS.cpp
+++ b/NSS.cpp
@@ -1,402 +1,637 @@
+#include "NSS.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <random>
+#include <stdexcept>
+#include <utility>
+
+namespace
+{
+constexpr float kPi = 3.14159265359f;
+constexpr float kTwoPi = 2.0f * kPi;
+constexpr float kEpsilon = 1e-7f;
 
-#include <NSS.h>
-#include <glm/gtx/norm.hpp>
-#include <glm/glm.hpp>
-#include <iostream>
-#include <ppl.h>
+float clampFloat(float value, float min_value, float max_value)
+{
+	return std::max(min_value, std::min(value, max_value));
+}
 
-void NSS::normalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals)
+float dotProduct(const glm::fvec3 &lhs, const glm::fvec3 &rhs)
 {
-	std::vector<std::vector<int>> normbuckets;
-	sort_into_buckets(m_original_normals, normbuckets);
-	int ndesired = int(ceil(sample_rate * m_original_vertices.size()));
-	sampled_vertices.clear();
-	sampled_normals.clear();
-	while (sampled_vertices.size() < ndesired)
+	return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z;
+}
+
+glm::fvec3 crossProduct(const glm::fvec3 &lhs, const glm::fvec3 &rhs)
+{
+	return glm::fvec3(
+		lhs.y * rhs.z - lhs.z * rhs.y,
+		lhs.z * rhs.x - lhs.x * rhs.z,
+		lhs.x * rhs.y - lhs.y * rhs.x);
+}
+
+float vectorLength(const glm::fvec3 &value)
+{
+	return std::sqrt(dotProduct(value, value));
+}
+
+glm::fvec3 normalizeSafe(const glm::fvec3 &value)
+{
+	const float norm = vectorLength(value);
+	if (!std::isfinite(norm) || norm <= kEpsilon)
 	{
-		for (int i = 0; i < normbuckets.size(); i++)
-		{
-			if (!normbuckets[i].empty())
-			{
-				int ind = normbuckets[i].back();
-				sampled_vertices.emplace_back(m_original_vertices[ind]);
-				sampled_normals.emplace_back(m_original_normals[ind]);
-				normbuckets[i].pop_back();
-			}
-		}
+		return glm::fvec3(0.0f, 0.0f, 1.0f);
 	}
+	return value / norm;
 }
 
-void NSS::sort_into_buckets(const std::vector<glm::fvec3> &n, std::vector<std::vector<int>> &normbuckets)
+float computeRotationalReturnValue(const glm::fvec3 &normalized_point, const glm::fvec3 &normal, float theta)
 {
-	const int Q = 4;
-	const float Qsqrt1_2 = 2.8284f;
-	normbuckets.resize(3 * Q * Q);
-	for (int i = 0; i < n.size(); i++)
+	const float point_norm = vectorLength(normalized_point);
+	if (!std::isfinite(point_norm) || point_norm <= kEpsilon)
 	{
-		const float *N = &n[i][0];
+		return 0.0f;
+	}
 
-		float ax = fabs(N[0]), ay = fabs(N[1]), az = fabs(N[2]);
-		int A;
-		float u, v;
-		if (ax > ay)
-		{
-			if (ax > az)
-			{
-				A = 0;
-				u = (N[0] > 0) ? N[1] : -N[1];
-				v = (N[0] > 0) ? N[2] : -N[2];
-			}
-			else
-			{
-				A = 2;
-				u = (N[2] > 0) ? N[0] : -N[0];
-				v = (N[2] > 0) ? N[1] : -N[1];
-			}
-		}
-		else
+	const glm::fvec3 point_dir = normalized_point / point_norm;
+	const glm::fvec3 normal_dir = normalizeSafe(normal);
+	const float dot_pn = clampFloat(dotProduct(point_dir, normal_dir), -1.0f, 1.0f);
+	const float beta = std::acos(dot_pn);
+
+	const float pp = 2.0f * point_norm * std::sin(theta * 0.5f);
+	const float sin_beta = std::sin(beta);
+	const float cos_beta = std::cos(beta);
+
+	const float pq_positive = pp * std::cos(beta - theta * 0.5f);
+	const float pq_negative = pp * std::cos(-beta - theta * 0.5f);
+
+	const float denominator_positive = std::max(kEpsilon, point_norm - pq_positive * cos_beta);
+	const float denominator_negative = std::max(kEpsilon, point_norm - pq_negative * cos_beta);
+
+	const float atan_positive = std::atan((pq_positive * sin_beta) / denominator_positive);
+	const float atan_negative = std::atan((pq_negative * (-sin_beta)) / denominator_negative);
+
+	const float gamma_positive = theta - atan_positive;
+	const float gamma_negative = theta - atan_negative;
+
+	const float rr_positive = point_norm * gamma_positive / theta;
+	const float rr_negative = point_norm * gamma_negative / theta;
+	const float rotational_return = std::max(rr_positive, rr_negative);
+
+	if (!std::isfinite(rotational_return))
+	{
+		return 0.0f;
+	}
+	return std::max(0.0f, rotational_return);
+}
+
+int sphericalBucketIndex(
+	const glm::fvec3 &raw_normal,
+	int azimuth_bins,
+	int z_bins,
+	bool fold_antipodal)
+{
+	if (azimuth_bins <= 0 || z_bins <= 0)
+	{
+		return -1;
+	}
+
+	glm::fvec3 normal = normalizeSafe(raw_normal);
+
+	if (fold_antipodal)
+	{
+		const bool flip =
+			normal.z < 0.0f ||
+			(normal.z == 0.0f && (normal.y < 0.0f || (normal.y == 0.0f && normal.x < 0.0f)));
+		if (flip)
 		{
-			if (ay > az)
-			{
-				A = 1;
-				u = (N[1] > 0) ? N[2] : -N[2];
-				v = (N[1] > 0) ? N[0] : -N[0];
-			}
-			else
-			{
-				A = 2;
-				u = (N[2] > 0) ? N[0] : -N[0];
-				v = (N[2] > 0) ? N[1] : -N[1];
-			}
+			normal = -normal;
 		}
-		int U = int(u * Qsqrt1_2) + (Q / 2);
-		int V = int(v * Qsqrt1_2) + (Q / 2);
-		normbuckets[((A * Q) + U) * Q + V].push_back(i);
 	}
-	for (int bucket = 0; bucket < normbuckets.size(); bucket++)
-		std::random_shuffle(normbuckets[bucket].begin(), normbuckets[bucket].end());
+
+	float azimuth = std::atan2(normal.y, normal.x);
+	if (azimuth < 0.0f)
+	{
+		azimuth += kTwoPi;
+	}
+
+	const float z_min = fold_antipodal ? 0.0f : -1.0f;
+	const float z_normalized = clampFloat((normal.z - z_min) / (1.0f - z_min), 0.0f, 1.0f);
+
+	int azimuth_index = static_cast<int>(std::floor(azimuth / kTwoPi * static_cast<float>(azimuth_bins)));
+	int z_index = static_cast<int>(std::floor(z_normalized * static_cast<float>(z_bins)));
+
+	if (azimuth_index >= azimuth_bins)
+	{
+		azimuth_index = azimuth_bins - 1;
+	}
+	if (z_index >= z_bins)
+	{
+		z_index = z_bins - 1;
+	}
+
+	return azimuth_index * z_bins + z_index;
 }
 
-void DNSS::dualNormalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals)
+void computeCenteredAndScaledVertices(
+	const std::vector<glm::fvec3> &vertices,
+	std::vector<glm::fvec3> &normalized_vertices)
 {
+	normalized_vertices.clear();
+	normalized_vertices.resize(vertices.size(), glm::fvec3(0.0f));
+	if (vertices.empty())
+	{
+		return;
+	}
 
-	// Timer timer;
-	//	timer.start();
-	computeCentroidandNormalize();
-	computeRotationalNormals();
-	computeRotationalReturn();
+	glm::fvec3 centroid(0.0f, 0.0f, 0.0f);
+	for (const glm::fvec3 &vertex : vertices)
+	{
+		centroid += vertex;
+	}
+	centroid /= static_cast<float>(vertices.size());
 
-	sortIntoBucket();
-	initBucketB();
-	int ndesired = int(ceil(sample_rate * m_vertices_original.size()));
+	float max_radius = 0.0f;
+	for (std::size_t i = 0; i < vertices.size(); ++i)
+	{
+		normalized_vertices[i] = vertices[i] - centroid;
+		max_radius = std::max(max_radius, vectorLength(normalized_vertices[i]));
+	}
 
-	sampled_vertices.clear();
-	sampled_normals.clear();
-	sampled_vertices.reserve(ndesired);
-	sampled_normals.reserve(ndesired);
+	if (max_radius <= kEpsilon)
+	{
+		return;
+	}
 
-	while (sampled_vertices.size() < ndesired)
+	const float inv_radius = 1.0f / max_radius;
+	for (glm::fvec3 &vertex : normalized_vertices)
 	{
-		int pid = pickPoint();
+		vertex *= inv_radius;
+	}
+}
+
+} // namespace
+
+#if defined(DNSS_HAS_CUDA)
+bool DNSSComputeRotationalFeaturesCuda(
+	const std::vector<glm::fvec3> &normalized_vertices,
+	const std::vector<glm::fvec3> &normals,
+	float theta_for_return_radians,
+	std::vector<glm::fvec3> *rotational_normals,
+	std::vector<float> *rotational_returns);
+#endif
+
+NSS::NSS()
+	: NSS(Options{})
+{
+}
+
+NSS::NSS(const Options &options)
+	: m_options(options), m_runtime_seed(options.random_seed)
+{
+}
+
+void NSS::setOptions(const Options &options)
+{
+	m_options = options;
+	m_runtime_seed = options.random_seed;
+}
+
+const NSS::Options &NSS::getOptions() const
+{
+	return m_options;
+}
 
-		sampled_vertices.emplace_back(m_vertices_original[pid]);
-		sampled_normals.emplace_back(m_normals_original[pid]);
-		// updateBucketOrder();
+void NSS::setInputCloud(const std::vector<glm::fvec3> &original_vertices, const std::vector<glm::fvec3> &original_normals)
+{
+	if (original_vertices.size() != original_normals.size())
+	{
+		throw std::invalid_argument("NSS::setInputCloud requires matching vertex and normal counts.");
 	}
-	sampled_vertices.shrink_to_fit();
-	sampled_normals.shrink_to_fit();
-	//	timer.stop();
-	//	std::wcout << L"DNSS sampling time : " << timer.peek_msec() << L" (ms)" << std::endl;
+	m_original_vertices = original_vertices;
+	m_original_normals = original_normals;
 }
 
-void DNSS::computeRotationalReturn()
+void NSS::normalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals)
 {
-	int nSize = static_cast<int>(m_vertices_normalized.size());
-	m_rotationalReturns.clear();
-	m_rotationalReturns.resize(nSize);
+	sampled_vertices.clear();
+	sampled_normals.clear();
+
+	const int target_count = targetSampleCount(sample_rate);
+	if (target_count <= 0)
+	{
+		return;
+	}
+
+	const int bucket_count = m_options.azimuth_bins * m_options.z_bins;
+	if (bucket_count <= 0)
+	{
+		throw std::invalid_argument("NSS bucket configuration must be positive.");
+	}
+
+	std::vector<std::vector<int>> buckets(static_cast<std::size_t>(bucket_count));
+	for (std::size_t index = 0; index < m_original_normals.size(); ++index)
+	{
+		int bucket_index = normalToBucketIndex(m_original_normals[index]);
+		if (bucket_index < 0 || bucket_index >= bucket_count)
+		{
+			bucket_index = 0;
+		}
+		buckets[static_cast<std::size_t>(bucket_index)].push_back(static_cast<int>(index));
+	}
 
-	Concurrency::parallel_for(0, nSize, [&](int idx)
-							  {
-		float dot_pn = glm::dot(glm::normalize(m_vertices_normalized[idx]), m_normals_original[idx]);
-		float beta = acos(dot_pn);// / (glm::l2Norm(m_vertices_normalized[idx])*glm::l2Norm(m_normals_original[idx])));
-		float p_abs = glm::l2Norm(m_vertices_normalized[idx]);
-		float pp_abs = 2 * p_abs *sin(m_thetaForReturn / 2);
+	std::mt19937 rng(m_options.deterministic ? m_options.random_seed : nextSeed());
+	for (std::vector<int> &bucket : buckets)
+	{
+		std::shuffle(bucket.begin(), bucket.end(), rng);
+	}
 
-		float pq_abs_positive = pp_abs * cos(beta - m_thetaForReturn / 2);
-		float pq_abs_negative = pp_abs * cos(-beta - m_thetaForReturn / 2);
-		float sin_beta = sin(beta);
-		float cos_beta = cos(beta);
+	sampled_vertices.reserve(static_cast<std::size_t>(target_count));
+	sampled_normals.reserve(static_cast<std::size_t>(target_count));
 
-		float atanbeta_positive = atan((pq_abs_positive*sin_beta) / (p_abs - pq_abs_positive * cos_beta));
-		float gamma_positive = m_thetaForReturn - atanbeta_positive;
+	while (sampled_vertices.size() < static_cast<std::size_t>(target_count))
+	{
+		bool selected_any = false;
+		for (std::vector<int> &bucket : buckets)
+		{
+			if (sampled_vertices.size() >= static_cast<std::size_t>(target_count))
+			{
+				break;
+			}
+			if (bucket.empty())
+			{
+				continue;
+			}
 
-		float atanbeta_negative = atan((pq_abs_negative*(-sin_beta)) / (p_abs - pq_abs_negative * cos_beta));
-		float gamma_negative = m_thetaForReturn - atanbeta_negative;
+			selected_any = true;
+			const int point_index = bucket.back();
+			bucket.pop_back();
 
-		float rotationalReturnPositive = p_abs * gamma_positive / m_thetaForReturn;
-		float rotationalReturnNegative = p_abs * gamma_negative / m_thetaForReturn;
+			sampled_vertices.push_back(m_original_vertices[static_cast<std::size_t>(point_index)]);
+			sampled_normals.push_back(m_original_normals[static_cast<std::size_t>(point_index)]);
+		}
 
-		float RR = fmax(rotationalReturnPositive, rotationalReturnNegative);
-		m_rotationalReturns[idx] = RR; });
+		if (!selected_any)
+		{
+			break;
+		}
+	}
 }
-void DNSS::initBucketB()
+
+int NSS::normalToBucketIndex(const glm::fvec3 &normal) const
 {
-	m_bucketList_T.clear();
-	m_bucketList_R.clear();
-	m_bucketList_T.resize(m_bucketTranslation.size());
-	m_bucketList_R.resize(m_bucketRotation.size());
-	const int bucketsize_R = static_cast<int>(m_bucketRotation.size());
-	const int bucketsize_T = static_cast<int>(m_bucketTranslation.size());
-	for (int idx = 0; idx < bucketsize_R; idx++)
-	{
-		m_bucketList_R[idx].bucketIndex = idx;
-		//	m_bucketRotation[idx].clear();
-		if (m_bucketRotation[idx].empty())
-			m_bucketList_R[idx].constraints = FLT_MAX;
-		else
-			m_bucketList_R[idx].constraints = 0.0f;
-	}
+	return sphericalBucketIndex(normal, m_options.azimuth_bins, m_options.z_bins, false);
+}
 
-	for (int idx = 0; idx < bucketsize_T; idx++)
+int NSS::targetSampleCount(float sample_rate) const
+{
+	if (!std::isfinite(sample_rate) || sample_rate <= 0.0f || m_original_vertices.empty())
 	{
-		m_bucketList_T[idx].bucketIndex = idx;
-		//	m_bucketTranslation[idx].clear();
-		if (m_bucketTranslation[idx].empty())
-			m_bucketList_T[idx].constraints = FLT_MAX;
-		else
-			m_bucketList_T[idx].constraints = 0.0f;
+		return 0;
 	}
-	typeBucket r = typeBucket::Rotation, t = typeBucket::Translation;
-	updateBucketOrder(r);
-	updateBucketOrder(t);
+
+	const float clamped_rate = std::min(1.0f, sample_rate);
+	const int count = static_cast<int>(std::ceil(clamped_rate * static_cast<float>(m_original_vertices.size())));
+	return std::clamp(count, 0, static_cast<int>(m_original_vertices.size()));
+}
+
+std::uint32_t NSS::nextSeed()
+{
+	m_runtime_seed = m_runtime_seed * 1664525u + 1013904223u;
+	return m_runtime_seed;
+}
+
+DNSS::DNSS()
+	: DNSS(Options{})
+{
+}
+
+DNSS::DNSS(const Options &options)
+	: m_options(options), m_runtime_seed(options.random_seed)
+{
+}
+
+void DNSS::setOptions(const Options &options)
+{
+	m_options = options;
+	m_runtime_seed = options.random_seed;
 }
 
-void DNSS::sortIntoBucket()
+const DNSS::Options &DNSS::getOptions() const
 {
-	int nSizePoints = static_cast<int>(m_vertices_normalized.size());
-	std::wcout << L"niszeP_points : " << nSizePoints << std::endl;
-	int nSizeBucketR = m_bucketsizeR_azimuth * m_bucketsizeR_polar;
-	int nSizeBucketT = m_bucketsizeT_azimuth * m_bucketsizeT_polar;
-	// Put it into a concurrent vector, sort it, and put it back into a member variable.
-	// Reason: concurrent_vector pop_back not available
-	std::vector<concurrency::concurrent_vector<std::pair<int, float>>> bucketRotation;
-	std::vector<concurrency::concurrent_vector<int>> bucketTranslation;
-	m_vecForBIdx.clear();
-	m_vecForBIdx.resize(nSizePoints);
-
-	bucketRotation.clear();
-	bucketRotation.resize(nSizeBucketR);
-	bucketTranslation.clear();
-	bucketTranslation.resize(nSizeBucketT);
-
-	m_bucketRotation.clear();
-	m_bucketRotation.resize(nSizeBucketR);
-	m_bucketTranslation.clear();
-	m_bucketTranslation.resize(nSizeBucketT);
-
-	// Bucket space - they don't fit perfectly evenly into the bucket and may be crowded to one side,
-	// so instead of dividing by the bucket size, divide by 10 (an arbitrary number) to make sure there's enough space.
-	Concurrency::parallel_for(0, nSizeBucketR, [&](int idx)
-							  { bucketRotation[idx].reserve(nSizePoints / 10); });
-	Concurrency::parallel_for(0, nSizeBucketT, [&](int idx)
-							  { bucketTranslation[idx].reserve(nSizePoints / 10); });
-
-	Concurrency::parallel_for(0, nSizePoints, [&](int idx)
-							  {
-		float coord_R_azimuth = 0.f;
-		float coord_R_polar = 0.f;
-		computeSphericalCoordinate(glm::normalize(m_normals_rotational[idx]), coord_R_azimuth, coord_R_polar);
-		int index_R_azimuth = static_cast<int>(std::floorf(fabs(coord_R_azimuth) / m_thetaForSort));
-		int index_R_polar = static_cast<int>(std::floorf(coord_R_polar / m_thetaForSort));
-		int index_R = index_R_azimuth * m_bucketsizeR_polar + index_R_polar;
-
-		bucketRotation[index_R].push_back(std::make_pair(idx, m_rotationalReturns[idx]));
-
-		float coord_T_azimuth = 0.f;
-		float coord_T_polar = 0.f;
-		computeSphericalCoordinate(glm::normalize(m_normals_original[idx]), coord_T_azimuth, coord_T_polar);
-		int index_T_azimuth = static_cast<int>(std::floorf((coord_T_azimuth + m_pi_degree) / m_thetaForSort));
-		int index_T_polar = static_cast<int>(std::floorf(coord_T_polar / m_thetaForSort));
-		int index_T = index_T_azimuth * m_bucketsizeT_polar + index_T_polar;
-
-		bucketTranslation[index_T].push_back(idx);
-
-		m_vecForBIdx[idx] = std::make_pair(index_R, index_T); });
-
-	Concurrency::parallel_for(0, nSizeBucketR, [&](int idx)
-							  {
-								  bucketRotation[idx].shrink_to_fit();
-								  m_bucketRotation[idx].resize(bucketRotation[idx].size());
-								  std::move(bucketRotation[idx].begin(), bucketRotation[idx].end(), m_bucketRotation[idx].begin()); });
-	Concurrency::parallel_for(0, nSizeBucketT, [&](int idx)
-							  {
-		bucketTranslation[idx].shrink_to_fit();
-		m_bucketTranslation[idx].resize(bucketTranslation[idx].size());
-		std::move(bucketTranslation[idx].begin(), bucketTranslation[idx].end(), m_bucketTranslation[idx].begin()); });
-
-	Concurrency::parallel_for(0, nSizeBucketR, [&](int idx)
-							  {
-		//ascendin order sort because only pop_back is possible
-		std::sort(m_bucketRotation[idx].begin(), m_bucketRotation[idx].end(),
-			[](const std::pair<int, float>& lhs, const std::pair<int, float>& rhs) {
-			return lhs.second < rhs.second;
-		}); });
-
-	Concurrency::parallel_for(0, nSizeBucketT, [&](int idx)
-							  { std::random_shuffle(m_bucketTranslation[idx].begin(), m_bucketTranslation[idx].end()); });
+	return m_options;
 }
 
-void DNSS::computeRotationalNormals()
+void DNSS::setUseCuda(bool enable)
 {
-	int nSize = static_cast<int>(m_normals_original.size());
-	m_normals_rotational.resize(m_normals_original.size());
-	Concurrency::parallel_for(0, nSize, [&](int idx)
-							  { m_normals_rotational[idx] = glm::cross(m_vertices_normalized[idx], m_normals_original[idx]); });
+	m_options.enable_cuda = enable;
 }
 
-void DNSS::computeCentroidandNormalize()
+bool DNSS::getUseCuda() const
 {
-	Eigen::Matrix3Xf vertices = Eigen::Map<const Eigen::Matrix3Xf>(&m_vertices_original[0].x, 3, m_vertices_original.size());
-	Eigen::Vector3f centroid_eigen = vertices.rowwise().mean();
-	glm::fvec3 centroid = (glm::fvec3 &)(*centroid_eigen.data());
-
-	int nSize = static_cast<int>(m_vertices_original.size());
-	m_vertices_normalized.clear();
-	m_vertices_normalized.resize(nSize);
-	Concurrency::parallel_for(0, nSize, [&](int idx)
-							  { m_vertices_normalized[idx] = m_vertices_original[idx] - centroid; });
-	Eigen::Matrix3Xf vertices_moved = Eigen::Map<const Eigen::Matrix3Xf>(&m_vertices_normalized[0].x, 3, m_vertices_normalized.size());
-	// normalize factor
-	Eigen::Vector3f centroid_eigen_moved = vertices_moved.rowwise().maxCoeff();
-	float Lmax = vertices_moved.colwise().norm().maxCoeff();
-	float L_inverse = 1 / Lmax;
-	Concurrency::parallel_for(0, nSize, [&](int idx)
-							  { m_vertices_normalized[idx] = m_vertices_normalized[idx] * L_inverse; });
+	return m_options.enable_cuda;
 }
 
-int DNSS::pickPoint()
+void DNSS::setInputCloud(const std::vector<glm::fvec3> &original_vertices, const std::vector<glm::fvec3> &original_normals)
 {
-	int bid_top = 0;
-	typeBucket bType_top, bType_another;
+	if (original_vertices.size() != original_normals.size())
+	{
+		throw std::invalid_argument("DNSS::setInputCloud requires matching vertex and normal counts.");
+	}
+	m_vertices_original = original_vertices;
+	m_normals_original = original_normals;
+}
+
+void DNSS::dualNormalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals)
+{
+	sampled_vertices.clear();
+	sampled_normals.clear();
+
+	const int target_count = targetSampleCount(sample_rate);
+	if (target_count <= 0)
+	{
+		return;
+	}
+
+	const int t_bucket_count = m_options.t_azimuth_bins * m_options.t_z_bins;
+	const int r_bucket_count = m_options.r_azimuth_bins * m_options.r_z_bins;
+	if (t_bucket_count <= 0 || r_bucket_count <= 0)
+	{
+		throw std::invalid_argument("DNSS bucket configuration must be positive.");
+	}
 
-	if (m_bucketList_R.front().constraints <= m_bucketList_T.front().constraints)
+	std::vector<glm::fvec3> normalized_vertices;
+	computeCenteredAndScaledVertices(m_vertices_original, normalized_vertices);
+
+	std::vector<glm::fvec3> rotational_normals(m_normals_original.size(), glm::fvec3(0.0f));
+	std::vector<float> rotational_returns(m_normals_original.size(), 0.0f);
+
+	bool cuda_used = false;
+#if defined(DNSS_HAS_CUDA)
+	if (m_options.enable_cuda)
 	{
-		bid_top = m_bucketList_R.front().bucketIndex;
-		bType_top = typeBucket::Rotation;
-		bType_another = typeBucket::Translation;
+		cuda_used = DNSSComputeRotationalFeaturesCuda(
+			normalized_vertices,
+			m_normals_original,
+			m_options.theta_for_return_radians,
+			&rotational_normals,
+			&rotational_returns);
 	}
-	else
+#endif
+
+	if (!cuda_used)
 	{
-		bid_top = m_bucketList_T.front().bucketIndex;
-		bType_top = typeBucket::Translation;
-		bType_another = typeBucket::Rotation;
+		for (std::size_t i = 0; i < normalized_vertices.size(); ++i)
+		{
+			rotational_normals[i] = crossProduct(normalized_vertices[i], m_normals_original[i]);
+			rotational_returns[i] = computeRotationalReturnValue(
+				normalized_vertices[i],
+				m_normals_original[i],
+				m_options.theta_for_return_radians);
+		}
 	}
-	int pid = -1;
 
-	switch (bType_top)
+	std::vector<std::vector<int>> t_buckets(static_cast<std::size_t>(t_bucket_count));
+
+	struct RotationEntry
 	{
-	case typeBucket::Rotation:
+		float rotational_return;
+		int point_index;
+	};
+	struct RotationEntryComparator
 	{
-		if (m_bucketRotation[bid_top].empty())
+		bool operator()(const RotationEntry &lhs, const RotationEntry &rhs) const
 		{
-			std::wcout << L"bucket id : " << bid_top << " and bucket size : " << m_bucketRotation[bid_top].size() << std::endl;
-			std::wcout << L"constraints : " << m_bucketList_R[bid_top].constraints << std::endl;
+			return lhs.rotational_return < rhs.rotational_return;
 		}
+	};
 
-		pid = m_bucketRotation[bid_top].back().first;
+	using RotationQueue = std::priority_queue<RotationEntry, std::vector<RotationEntry>, RotationEntryComparator>;
+	std::vector<RotationQueue> r_buckets(static_cast<std::size_t>(r_bucket_count));
 
-		m_bucketList_R.front().constraints += m_bucketRotation[bid_top].back().second;
-		m_bucketRotation[bid_top].pop_back();
+	std::vector<int> point_to_t_bucket(m_vertices_original.size(), 0);
+	std::vector<int> point_to_r_bucket(m_vertices_original.size(), 0);
 
-		int bid_T = m_vecForBIdx[pid].second;
+	for (std::size_t point_index = 0; point_index < m_vertices_original.size(); ++point_index)
+	{
+		int t_bucket = translationalBucketIndex(m_normals_original[point_index]);
+		int r_bucket = rotationalBucketIndex(rotational_normals[point_index]);
 
-		if (!m_bucketTranslation[bid_T].empty())
+		if (t_bucket < 0 || t_bucket >= t_bucket_count)
 		{
-			m_bucketTranslation[bid_T].erase(std::remove_if(m_bucketTranslation[bid_T].begin(), m_bucketTranslation[bid_T].end(),
-															[&pid](const int &elem)
-															{ return elem == pid; }),
-											 m_bucketTranslation[bid_T].end());
+			t_bucket = 0;
 		}
-		if (m_bucketTranslation[bid_T].empty())
+		if (r_bucket < 0 || r_bucket >= r_bucket_count)
 		{
-			auto it = std::find_if(m_bucketList_T.begin(), m_bucketList_T.end(),
-								   [bid_T](const structBucket &element)
-								   { return element.bucketIndex == bid_T; });
-			it->constraints = FLT_MAX;
-			updateBucketOrder(bType_another);
+			r_bucket = 0;
 		}
 
-		if (m_bucketRotation[bid_top].empty())
+		t_buckets[static_cast<std::size_t>(t_bucket)].push_back(static_cast<int>(point_index));
+		r_buckets[static_cast<std::size_t>(r_bucket)].push(
+			RotationEntry{rotational_returns[point_index], static_cast<int>(point_index)});
+
+		point_to_t_bucket[point_index] = t_bucket;
+		point_to_r_bucket[point_index] = r_bucket;
+	}
+
+	std::mt19937 rng(m_options.deterministic ? m_options.random_seed : nextSeed());
+	for (std::vector<int> &bucket : t_buckets)
+	{
+		std::shuffle(bucket.begin(), bucket.end(), rng);
+	}
+
+	std::vector<float> t_constraints(static_cast<std::size_t>(t_bucket_count), 0.0f);
+	std::vector<float> r_constraints(static_cast<std::size_t>(r_bucket_count), 0.0f);
+	std::vector<unsigned char> is_active(m_vertices_original.size(), 1u);
+
+	sampled_vertices.reserve(static_cast<std::size_t>(target_count));
+	sampled_normals.reserve(static_cast<std::size_t>(target_count));
+
+	auto popTranslationalPoint = [&](int bucket_index) -> int
+	{
+		auto &bucket = t_buckets[static_cast<std::size_t>(bucket_index)];
+		while (!bucket.empty() && is_active[static_cast<std::size_t>(bucket.back())] == 0u)
+		{
+			bucket.pop_back();
+		}
+		if (bucket.empty())
 		{
-			m_bucketList_R.front().constraints = FLT_MAX;
+			return -1;
+		}
+		const int point_index = bucket.back();
+		bucket.pop_back();
+		return point_index;
+	};
+
+	auto popRotationalPoint = [&](int bucket_index, float *rotational_weight) -> int
+	{
+		auto &bucket = r_buckets[static_cast<std::size_t>(bucket_index)];
+		while (!bucket.empty() && is_active[static_cast<std::size_t>(bucket.top().point_index)] == 0u)
+		{
+			bucket.pop();
+		}
+		if (bucket.empty())
+		{
+			*rotational_weight = 0.0f;
+			return -1;
+		}
+		const RotationEntry entry = bucket.top();
+		bucket.pop();
+		*rotational_weight = entry.rotational_return;
+		return entry.point_index;
+	};
+
+	auto selectPoint = [&](int point_index)
+	{
+		if (point_index < 0)
+		{
+			return;
+		}
+		if (is_active[static_cast<std::size_t>(point_index)] == 0u)
+		{
+			return;
+		}
+
+		is_active[static_cast<std::size_t>(point_index)] = 0u;
+		sampled_vertices.push_back(m_vertices_original[static_cast<std::size_t>(point_index)]);
+		sampled_normals.push_back(m_normals_original[static_cast<std::size_t>(point_index)]);
+
+		const int t_bucket = point_to_t_bucket[static_cast<std::size_t>(point_index)];
+		const int r_bucket = point_to_r_bucket[static_cast<std::size_t>(point_index)];
+		t_constraints[static_cast<std::size_t>(t_bucket)] += 1.0f;
+		r_constraints[static_cast<std::size_t>(r_bucket)] += rotational_returns[static_cast<std::size_t>(point_index)];
+	};
+
+	if (m_options.initialize_rotation_buckets)
+	{
+		for (int r_bucket = 0; r_bucket < r_bucket_count; ++r_bucket)
+		{
+			if (sampled_vertices.size() >= static_cast<std::size_t>(target_count))
+			{
+				break;
+			}
+			float weight = 0.0f;
+			const int point_index = popRotationalPoint(r_bucket, &weight);
+			(void)weight;
+			selectPoint(point_index);
 		}
-		updateBucketOrder(bType_top);
-		break;
 	}
-	case typeBucket::Translation:
+
+	while (sampled_vertices.size() < static_cast<std::size_t>(target_count))
 	{
-		if (m_bucketTranslation[bid_top].empty())
+		enum class BucketType
+		{
+			None,
+			Translational,
+			Rotational
+		};
+
+		BucketType best_type = BucketType::None;
+		int best_bucket_index = -1;
+		float best_constraint = std::numeric_limits<float>::max();
+
+		for (int t_bucket = 0; t_bucket < t_bucket_count; ++t_bucket)
 		{
-			std::wcout << L"bucket id : " << bid_top << " and bucket size : " << m_bucketTranslation[bid_top].size() << std::endl;
-			std::wcout << L"constraints : " << m_bucketList_T[bid_top].constraints << std::endl;
+			auto &bucket = t_buckets[static_cast<std::size_t>(t_bucket)];
+			while (!bucket.empty() && is_active[static_cast<std::size_t>(bucket.back())] == 0u)
+			{
+				bucket.pop_back();
+			}
+			if (bucket.empty())
+			{
+				continue;
+			}
+
+			const float constraint = t_constraints[static_cast<std::size_t>(t_bucket)];
+			if (constraint < best_constraint)
+			{
+				best_constraint = constraint;
+				best_bucket_index = t_bucket;
+				best_type = BucketType::Translational;
+			}
 		}
-		pid = m_bucketTranslation[bid_top].back();
 
-		m_bucketList_T.front().constraints += 1.0f;
-		m_bucketTranslation[bid_top].pop_back();
+		for (int r_bucket = 0; r_bucket < r_bucket_count; ++r_bucket)
+		{
+			auto &bucket = r_buckets[static_cast<std::size_t>(r_bucket)];
+			while (!bucket.empty() && is_active[static_cast<std::size_t>(bucket.top().point_index)] == 0u)
+			{
+				bucket.pop();
+			}
+			if (bucket.empty())
+			{
+				continue;
+			}
 
-		int bid_R = m_vecForBIdx[pid].first;
+			const float constraint = r_constraints[static_cast<std::size_t>(r_bucket)];
+			if (constraint < best_constraint)
+			{
+				best_constraint = constraint;
+				best_bucket_index = r_bucket;
+				best_type = BucketType::Rotational;
+			}
+		}
 
-		if (!m_bucketRotation[bid_R].empty())
+		if (best_type == BucketType::None)
 		{
-			m_bucketRotation[bid_R].erase(std::remove_if(m_bucketRotation[bid_R].begin(), m_bucketRotation[bid_R].end(),
-														 [&pid](const std::pair<int, float> &elem)
-														 { return elem.first == pid; }),
-										  m_bucketRotation[bid_R].end());
+			break;
 		}
-		if (m_bucketRotation[bid_R].empty())
+
+		if (best_type == BucketType::Translational)
 		{
-			auto it = std::find_if(m_bucketList_R.begin(), m_bucketList_R.end(),
-								   [bid_R](const structBucket &element)
-								   { return element.bucketIndex == bid_R; });
-			it->constraints = FLT_MAX;
-			updateBucketOrder(bType_another);
+			selectPoint(popTranslationalPoint(best_bucket_index));
 		}
-		if (m_bucketTranslation[bid_top].empty())
+		else
 		{
-			m_bucketList_T.front().constraints = FLT_MAX;
+			float rotational_weight = 0.0f;
+			selectPoint(popRotationalPoint(best_bucket_index, &rotational_weight));
 		}
-		updateBucketOrder(bType_top);
-		break;
 	}
+
+	if (sampled_vertices.size() < static_cast<std::size_t>(target_count))
+	{
+		for (std::size_t point_index = 0; point_index < is_active.size(); ++point_index)
+		{
+			if (sampled_vertices.size() >= static_cast<std::size_t>(target_count))
+			{
+				break;
+			}
+			if (is_active[point_index] == 0u)
+			{
+				continue;
+			}
+			selectPoint(static_cast<int>(point_index));
+		}
 	}
-	return pid;
 }
-void DNSS::updateBucketOrder(typeBucket &bType)
+
+int DNSS::translationalBucketIndex(const glm::fvec3 &normal) const
 {
-	switch (bType)
-	{
-	case typeBucket::Rotation:
-		std::sort(m_bucketList_R.begin(), m_bucketList_R.end(),
-				  [](const structBucket &lhs, const structBucket &rhs)
-				  {
-					  return lhs.constraints < rhs.constraints;
-				  });
-		break;
-	case typeBucket::Translation:
-		std::sort(m_bucketList_T.begin(), m_bucketList_T.end(),
-				  [](const structBucket &lhs, const structBucket &rhs)
-				  {
-					  return lhs.constraints < rhs.constraints;
-				  });
-		break;
+	return sphericalBucketIndex(normal, m_options.t_azimuth_bins, m_options.t_z_bins, false);
+}
+
+int DNSS::rotationalBucketIndex(const glm::fvec3 &rotational_normal) const
+{
+	return sphericalBucketIndex(rotational_normal, m_options.r_azimuth_bins, m_options.r_z_bins, true);
+}
+
+int DNSS::targetSampleCount(float sample_rate) const
+{
+	if (!std::isfinite(sample_rate) || sample_rate <= 0.0f || m_vertices_original.empty())
+	{
+		return 0;
 	}
+
+	const float clamped_rate = std::min(1.0f, sample_rate);
+	const int count = static_cast<int>(std::ceil(clamped_rate * static_cast<float>(m_vertices_original.size())));
+	return std::clamp(count, 0, static_cast<int>(m_vertices_original.size()));
 }
 
-void DNSS::computeSphericalCoordinate(const glm::fvec3 &normal, float &coordinates_azimuth, float &coordinates_polar)
+std::uint32_t DNSS::nextSeed()
 {
-	float radian_azimuth = atan2(normal.y, normal.x);
-	float radian_polar = acos(normal.z);
-	coordinates_azimuth = radian_azimuth * m_pi_degree / m_pi_radian;
-	coordinates_polar = radian_polar * m_pi_degree / m_pi_radian;
+	m_runtime_seed = m_runtime_seed * 1664525u + 1013904223u;
+	return m_runtime_seed;
 }
diff --git a/NSS.h b/NSS.h
index d19bdf5..81a6e15 100644
--- a/NSS.h
+++ b/NSS.h
@@ -1,154 +1,193 @@
 #pragma once
 
-#include <glm/glm.hpp>
-#include <algorithm>
+#include <cstdint>
 #include <vector>
-#include <Eigen/Dense>
-#include <Eigen/Geometry>
-#include <Eigen/Core>
-#include <concurrent_queue.h>
-#include <concurrent_vector.h>
-#include <concurrent_unordered_map.h>
-
-/*
- * normal space sampling algorithm
-*
- *
- * original source code: https://github.com/kyzyx/scanalyze/
- *Scanalyze (version 1.0) license information
- -------------------------------------------
-
- Scanalyze is copyright (c) 2002 the Board of Trustees of The Leland
- Stanford Junior University. All rights reserved.
-
- This software is covered by the Stanford Computer Graphics Laboratory's
- General Software License. This license is royalty-free, nonexclusive,
- and nontransferable.  The terms and conditions of this license are
- viewable at this URL:
-
- http://graphics.stanford.edu/software/license.html
-
-
- The scanalyze software also builds upon a number of 3rd party
- software components, some of which may be copyrighted:
-
- 1. Togl
- 2. Tcl/Tk
- 3. STL
- 4. Template Numerical Toolkit (TNT)
- 5. Janne Heikkila's Camera Calibration Toolbox
- 6. Graphics Gems
- 7. Miscellaneous SGI code
-
- Further information about each of these software components and
- their terms of usage is included below:
-
- ----------------------------------------------------------------------
- *
- **/
+
+#if __has_include(<glm/vec3.hpp>)
+#include <glm/vec3.hpp>
+#else
+namespace glm
+{
+struct fvec3
+{
+	float x;
+	float y;
+	float z;
+
+	constexpr fvec3()
+		: x(0.0f), y(0.0f), z(0.0f)
+	{
+	}
+
+	constexpr explicit fvec3(float value)
+		: x(value), y(value), z(value)
+	{
+	}
+
+	constexpr fvec3(float x_value, float y_value, float z_value)
+		: x(x_value), y(y_value), z(z_value)
+	{
+	}
+
+	float &operator[](int index)
+	{
+		return (&x)[index];
+	}
+
+	const float &operator[](int index) const
+	{
+		return (&x)[index];
+	}
+
+	fvec3 &operator+=(const fvec3 &rhs)
+	{
+		x += rhs.x;
+		y += rhs.y;
+		z += rhs.z;
+		return *this;
+	}
+
+	fvec3 &operator-=(const fvec3 &rhs)
+	{
+		x -= rhs.x;
+		y -= rhs.y;
+		z -= rhs.z;
+		return *this;
+	}
+
+	fvec3 &operator*=(float scalar)
+	{
+		x *= scalar;
+		y *= scalar;
+		z *= scalar;
+		return *this;
+	}
+
+	fvec3 &operator/=(float scalar)
+	{
+		x /= scalar;
+		y /= scalar;
+		z /= scalar;
+		return *this;
+	}
+};
+
+inline fvec3 operator+(fvec3 lhs, const fvec3 &rhs)
+{
+	lhs += rhs;
+	return lhs;
+}
+
+inline fvec3 operator-(fvec3 lhs, const fvec3 &rhs)
+{
+	lhs -= rhs;
+	return lhs;
+}
+
+inline fvec3 operator-(const fvec3 &value)
+{
+	return fvec3(-value.x, -value.y, -value.z);
+}
+
+inline fvec3 operator*(fvec3 lhs, float scalar)
+{
+	lhs *= scalar;
+	return lhs;
+}
+
+inline fvec3 operator*(float scalar, fvec3 rhs)
+{
+	rhs *= scalar;
+	return rhs;
+}
+
+inline fvec3 operator/(fvec3 lhs, float scalar)
+{
+	lhs /= scalar;
+	return lhs;
+}
+} // namespace glm
+#endif
+
 class NSS
 {
 public:
-	/** \brief Function to perform normal space sampling
-	 * \param[in] sample_rate The sampling rate, maximum 1.0. Repeat until the number is filled by multiplying the rate.
+	struct Options
+	{
+		int azimuth_bins = 12;
+		int z_bins = 6;
+		std::uint32_t random_seed = 0x12345678u;
+		bool deterministic = true;
+	};
+
+	NSS();
+	explicit NSS(const Options &options);
+
+	void setOptions(const Options &options);
+	const Options &getOptions() const;
+
+	/** \brief Function to perform normal space sampling.
+	 * \param[in] sample_rate The sampling rate in [0, 1].
 	 * \param[out] sampled_vertices Data of sampled points.
 	 * \param[out] sampled_normals Data of sampled normals.
 	 */
 	void normalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals);
 
-	/** \brief Function to specify the original data for sampling
-	 * \param[in] pointData Point data.
-	 * \param[in] normalData Normal data.
+	/** \brief Function to specify the original data for sampling.
+	 * \param[in] original_vertices Point data.
+	 * \param[in] original_normals Normal data.
 	 */
-	void setInputCloud(const std::vector<glm::fvec3> original_vertices, const std::vector<glm::fvec3> original_normals);
+	void setInputCloud(const std::vector<glm::fvec3> &original_vertices, const std::vector<glm::fvec3> &original_normals);
 
 private:
-	/** \brief Create indices based on the normal values and then store the index values in buckets.
-	 * \param[in] const std::vector<glm::fvec3> &n A series of normal values.
-	 * \param[out] std::vector< std::vector<int>> Buckets containing indices.
-	 */
-	void sort_into_buckets(const std::vector<glm::fvec3> &n, std::vector<std::vector<int>> &normbuckets);
+	int normalToBucketIndex(const glm::fvec3 &normal) const;
+	int targetSampleCount(float sample_rate) const;
+	std::uint32_t nextSeed();
 
 private:
+	Options m_options;
+	std::uint32_t m_runtime_seed = m_options.random_seed;
 	std::vector<glm::fvec3> m_original_vertices;
 	std::vector<glm::fvec3> m_original_normals;
 };
 
-/*
- * Dual Normal Space Sampling
- * Kwok, Tsz-Ho. "DNSS: Dual-Normal-Space Sampling for 3-D ICP Registration."
- * IEEE Transactions on Automation Science and Engineering (2018).
- * Sampling including rotation information in addition to the original NSS.
- * Adding Rotational Information to NSS
- * Author: seweon.jeon
- * Date: 2018.04.09
- **/
-
 class DNSS
 {
 public:
-	void setInputCloud(const std::vector<glm::fvec3> original_vertices, const std::vector<glm::fvec3> original_normals);
-	void dualNormalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals);
-
-private:
-	struct structBucket
+	struct Options
 	{
-		int bucketIndex = 0;
-		float constraints = 0;
+		int t_azimuth_bins = 12;
+		int t_z_bins = 6;
+		int r_azimuth_bins = 6;
+		int r_z_bins = 6;
+		float theta_for_return_radians = 0.78539816339f; // pi / 4
+		bool initialize_rotation_buckets = true;
+		bool enable_cuda = false;
+		std::uint32_t random_seed = 0x9e3779b9u;
+		bool deterministic = true;
 	};
 
-	enum class typeBucket : int
-	{
-		Rotation = 0,
-		Translation,
-	};
+	DNSS();
+	explicit DNSS(const Options &options);
 
-	void computeRotationalReturn();
-	void initBucketB();
-	void sortIntoBucket();
-	void computeRotationalNormals();
-	void computeCentroidandNormalize();
-
-	int pickPoint();
-
-	void updateBucketOrder(typeBucket &bType);
-	/**
-	 * \brief Calculates azimuth and polar values based on Normal (nx, ny, nz)
-	 * \note The naming of azimuth and polar angles differ in the paper and ISO rule, so this follows the ISO rule
-	 * ISO rule: theta - polar angle (0~ pi), phi - azimuth angle (0~2pi)
-	 * Paper: theta - azimuth angle (0~2pi), phi - polar angle (0~pi)
-	 * pi = 180; 2pi = 360;
-	 * \param[in] glm::fvec3 Normal value.
-	 * \param[out] float coordinates_azimuth.
-	 * \param[out] float coordinates_polar.
-	 */
-	void computeSphericalCoordinate(const glm::fvec3 &normal, float &coordinates_azimuth, float &coordinates_polar);
+	void setOptions(const Options &options);
+	const Options &getOptions() const;
+
+	void setUseCuda(bool enable);
+	bool getUseCuda() const;
+
+	void setInputCloud(const std::vector<glm::fvec3> &original_vertices, const std::vector<glm::fvec3> &original_normals);
+	void dualNormalSpaceSampling(const float &sample_rate, std::vector<glm::fvec3> &sampled_vertices, std::vector<glm::fvec3> &sampled_normals);
+
+private:
+	int translationalBucketIndex(const glm::fvec3 &normal) const;
+	int rotationalBucketIndex(const glm::fvec3 &rotational_normal) const;
+	int targetSampleCount(float sample_rate) const;
+	std::uint32_t nextSeed();
 
 private:
-	std::vector<structBucket> m_bucketList_R;
-	std::vector<structBucket> m_bucketList_T;
-	std::vector<std::pair<int, int>> m_vecForBIdx;
-	std::vector<std::vector<std::pair<int, float>>> m_bucketRotation;
-	std::vector<std::vector<int>> m_bucketTranslation;
+	Options m_options;
+	std::uint32_t m_runtime_seed = m_options.random_seed;
 	std::vector<glm::fvec3> m_vertices_original;
 	std::vector<glm::fvec3> m_normals_original;
-	std::vector<glm::fvec3> m_vertices_normalized;
-	std::vector<glm::fvec3> m_normals_rotational;
-	std::vector<float> m_rotationalReturns;
-
-	/*
-	 * \note The naming of azimuth and polar angles differ in the paper and ISO rule, so this follows the ISO rule
-	 * ISO rule: theta - polar angle (0~180), phi - azimuth angle (0~360)
-	 * Paper: theta - azimuth angle (0~360), phi - polar angle (0~180)
-	 */
-	static const int m_bucketsizeT_azimuth = 12;
-	static const int m_bucketsizeT_polar = 6;
-	static const int m_bucketsizeR_azimuth = 6;
-	static const int m_bucketsizeR_polar = 6;
-
-	const float m_pi_degree = 180.0f;
-	const float m_pi_radian = 3.141592f;
-	const float m_pi2_degree = 360.0f;
-	const float m_thetaForSort = m_pi_degree / 6;
-	const float m_thetaForReturn = m_pi_degree / 4;
-};
\ No newline at end of file
+};
diff --git a/README.md b/README.md
index de38a01..4da9f82 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,55 @@
-# DNSS
-Normal Space Sampling and Dual Normal Space Sampling
+# DNSS / NSS
 
-An Implementation of DNSS and NSS.
+Clean C++ implementation of:
 
-I am not author of original paper.
-I just implemented it.
+- NSS: Normal-Space Sampling for ICP point selection.
+- DNSS: Dual-Normal-Space Sampling from
+  Tsz-Ho Kwok, "DNSS: Dual-Normal-Space Sampling for 3-D ICP Registration," IEEE T-ASE, 2018.
 
-NSS: Normal Space Sampling
-DNSS: Dual Normal Space Sampling
+## What This Version Fixes
 
-parameters are configured same as original paper.
+- Rewrites legacy code into readable, modular C++17.
+- Adds missing method implementations (`setInputCloud`) and input validation.
+- Fixes rotational-return math to use radians consistently.
+- Uses efficient lazy removal (active-mask + heap cleanup) instead of repeated `erase` on vectors.
+- Adds optional CUDA acceleration for DNSS rotational feature computation.
 
-Kwok, Tsz-Ho. "DNSS: Dual-Normal-Space Sampling for 3-D ICP Registration."
-IEEE Transactions on Automation Science and Engineering (2018).
+## Algorithm Notes
+
+DNSS follows the paper's main idea:
+
+1. Center and scale points by `Lmax`.
+2. Build translational normal space from original normals.
+3. Build rotational normal space from `nr = p x n`.
+4. Compute rotational return and use it as the rotational-bucket constraint increment.
+5. Iteratively pick points from the currently least-constrained bucket across both spaces.
+
+Default bucket settings:
+
+- Translational space: `12 x 6`
+- Rotational space: `6 x 6`
+- Return angle `theta`: `pi / 4`
+
+## Build
+
+CPU only:
+
+```bash
+cmake -S . -B build
+cmake --build build
+```
+
+With CUDA (if CUDA toolkit is installed):
+
+```bash
+cmake -S . -B build -DDNSS_ENABLE_CUDA=ON
+cmake --build build
+```
+
+When CUDA is enabled, call `DNSS::setUseCuda(true)` to request GPU acceleration. If GPU execution fails at runtime, the implementation falls back to the CPU path.
+
+## References
+
+- DNSS paper (2018): https://ieeexplore.ieee.org/document/8375750
+- DNSS open-access repository page: https://spectrum.library.concordia.ca/id/eprint/984398/
+- NSS/ICP variant context: Rusinkiewicz and Levoy, "Efficient Variants of the ICP Algorithm," 3DIM 2001.
diff --git a/dnss_cuda.cu b/dnss_cuda.cu
new file mode 100644
index 0000000..ada8572
--- /dev/null
+++ b/dnss_cuda.cu
@@ -0,0 +1,217 @@
+#include <cmath>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "NSS.h"
+
+namespace
+{
+constexpr float kEpsilon = 1e-7f;
+
+__device__ float clampFloat(float value, float min_value, float max_value)
+{
+	return fmaxf(min_value, fminf(value, max_value));
+}
+
+__global__ void computeRotationalFeaturesKernel(
+	const float *normalized_vertices,
+	const float *normals,
+	float theta,
+	float *rotational_normals,
+	float *rotational_returns,
+	int point_count)
+{
+	const int index = blockIdx.x * blockDim.x + threadIdx.x;
+	if (index >= point_count)
+	{
+		return;
+	}
+
+	const int base = 3 * index;
+	const float px = normalized_vertices[base + 0];
+	const float py = normalized_vertices[base + 1];
+	const float pz = normalized_vertices[base + 2];
+	const float nx = normals[base + 0];
+	const float ny = normals[base + 1];
+	const float nz = normals[base + 2];
+
+	// Rotational normal nr = p x n.
+	rotational_normals[base + 0] = py * nz - pz * ny;
+	rotational_normals[base + 1] = pz * nx - px * nz;
+	rotational_normals[base + 2] = px * ny - py * nx;
+
+	const float point_norm = sqrtf(px * px + py * py + pz * pz);
+	if (!(isfinite(point_norm)) || point_norm <= kEpsilon)
+	{
+		rotational_returns[index] = 0.0f;
+		return;
+	}
+
+	float normal_norm = sqrtf(nx * nx + ny * ny + nz * nz);
+	if (!(isfinite(normal_norm)) || normal_norm <= kEpsilon)
+	{
+		normal_norm = 1.0f;
+	}
+
+	const float pdx = px / point_norm;
+	const float pdy = py / point_norm;
+	const float pdz = pz / point_norm;
+	const float ndx = nx / normal_norm;
+	const float ndy = ny / normal_norm;
+	const float ndz = nz / normal_norm;
+
+	const float dot_pn = clampFloat(pdx * ndx + pdy * ndy + pdz * ndz, -1.0f, 1.0f);
+	const float beta = acosf(dot_pn);
+	const float pp = 2.0f * point_norm * sinf(theta * 0.5f);
+	const float sin_beta = sinf(beta);
+	const float cos_beta = cosf(beta);
+
+	const float pq_positive = pp * cosf(beta - theta * 0.5f);
+	const float pq_negative = pp * cosf(-beta - theta * 0.5f);
+
+	const float denominator_positive = fmaxf(kEpsilon, point_norm - pq_positive * cos_beta);
+	const float denominator_negative = fmaxf(kEpsilon, point_norm - pq_negative * cos_beta);
+
+	const float atan_positive = atanf((pq_positive * sin_beta) / denominator_positive);
+	const float atan_negative = atanf((pq_negative * (-sin_beta)) / denominator_negative);
+
+	const float gamma_positive = theta - atan_positive;
+	const float gamma_negative = theta - atan_negative;
+
+	const float rr_positive = point_norm * gamma_positive / theta;
+	const float rr_negative = point_norm * gamma_negative / theta;
+
+	const float rr = fmaxf(rr_positive, rr_negative);
+	if (!isfinite(rr) || rr < 0.0f)
+	{
+		rotational_returns[index] = 0.0f;
+	}
+	else
+	{
+		rotational_returns[index] = rr;
+	}
+}
+
+bool isCudaSuccess(cudaError_t status)
+{
+	return status == cudaSuccess;
+}
+
+} // namespace
+
+bool DNSSComputeRotationalFeaturesCuda(
+	const std::vector<glm::fvec3> &normalized_vertices,
+	const std::vector<glm::fvec3> &normals,
+	float theta_for_return_radians,
+	std::vector<glm::fvec3> *rotational_normals,
+	std::vector<float> *rotational_returns)
+{
+	if (rotational_normals == nullptr || rotational_returns == nullptr)
+	{
+		return false;
+	}
+	if (normalized_vertices.size() != normals.size())
+	{
+		return false;
+	}
+
+	const int point_count = static_cast<int>(normalized_vertices.size());
+	rotational_normals->assign(normalized_vertices.size(), glm::fvec3(0.0f));
+	rotational_returns->assign(normalized_vertices.size(), 0.0f);
+
+	if (point_count == 0)
+	{
+		return true;
+	}
+
+	std::vector<float> vertices_flat(static_cast<std::size_t>(point_count) * 3u);
+	std::vector<float> normals_flat(static_cast<std::size_t>(point_count) * 3u);
+	for (int i = 0; i < point_count; ++i)
+	{
+		vertices_flat[3 * i + 0] = normalized_vertices[static_cast<std::size_t>(i)].x;
+		vertices_flat[3 * i + 1] = normalized_vertices[static_cast<std::size_t>(i)].y;
+		vertices_flat[3 * i + 2] = normalized_vertices[static_cast<std::size_t>(i)].z;
+		normals_flat[3 * i + 0] = normals[static_cast<std::size_t>(i)].x;
+		normals_flat[3 * i + 1] = normals[static_cast<std::size_t>(i)].y;
+		normals_flat[3 * i + 2] = normals[static_cast<std::size_t>(i)].z;
+	}
+
+	float *d_vertices = nullptr;
+	float *d_normals = nullptr;
+	float *d_rot_normals = nullptr;
+	float *d_rot_returns = nullptr;
+
+	const std::size_t vector_bytes = vertices_flat.size() * sizeof(float);
+	const std::size_t return_bytes = rotational_returns->size() * sizeof(float);
+
+	if (!isCudaSuccess(cudaMalloc(&d_vertices, vector_bytes)) ||
+		!isCudaSuccess(cudaMalloc(&d_normals, vector_bytes)) ||
+		!isCudaSuccess(cudaMalloc(&d_rot_normals, vector_bytes)) ||
+		!isCudaSuccess(cudaMalloc(&d_rot_returns, return_bytes)))
+	{
+		cudaFree(d_vertices);
+		cudaFree(d_normals);
+		cudaFree(d_rot_normals);
+		cudaFree(d_rot_returns);
+		return false;
+	}
+
+	if (!isCudaSuccess(cudaMemcpy(d_vertices, vertices_flat.data(), vector_bytes, cudaMemcpyHostToDevice)) ||
+		!isCudaSuccess(cudaMemcpy(d_normals, normals_flat.data(), vector_bytes, cudaMemcpyHostToDevice)))
+	{
+		cudaFree(d_vertices);
+		cudaFree(d_normals);
+		cudaFree(d_rot_normals);
+		cudaFree(d_rot_returns);
+		return false;
+	}
+
+	const int threads = 256;
+	const int blocks = (point_count + threads - 1) / threads;
+	computeRotationalFeaturesKernel<<<blocks, threads>>>(
+		d_vertices,
+		d_normals,
+		theta_for_return_radians,
+		d_rot_normals,
+		d_rot_returns,
+		point_count);
+
+	if (!isCudaSuccess(cudaGetLastError()) || !isCudaSuccess(cudaDeviceSynchronize()))
+	{
+		cudaFree(d_vertices);
+		cudaFree(d_normals);
+		cudaFree(d_rot_normals);
+		cudaFree(d_rot_returns);
+		return false;
+	}
+
+	std::vector<float> rot_normals_flat(vertices_flat.size());
+	std::vector<float> rot_returns_flat(rotational_returns->size());
+
+	if (!isCudaSuccess(cudaMemcpy(rot_normals_flat.data(), d_rot_normals, vector_bytes, cudaMemcpyDeviceToHost)) ||
+		!isCudaSuccess(cudaMemcpy(rot_returns_flat.data(), d_rot_returns, return_bytes, cudaMemcpyDeviceToHost)))
+	{
+		cudaFree(d_vertices);
+		cudaFree(d_normals);
+		cudaFree(d_rot_normals);
+		cudaFree(d_rot_returns);
+		return false;
+	}
+
+	cudaFree(d_vertices);
+	cudaFree(d_normals);
+	cudaFree(d_rot_normals);
+	cudaFree(d_rot_returns);
+
+	for (int i = 0; i < point_count; ++i)
+	{
+		(*rotational_normals)[static_cast<std::size_t>(i)] = glm::fvec3(
+			rot_normals_flat[3 * i + 0],
+			rot_normals_flat[3 * i + 1],
+			rot_normals_flat[3 * i + 2]);
+		(*rotational_returns)[static_cast<std::size_t>(i)] = rot_returns_flat[static_cast<std::size_t>(i)];
+	}
+
+	return true;
+}

From 4bb5106961b499d59513bdd7f18b5764f0c534be Mon Sep 17 00:00:00 2001
From: "sewon.jeon" <irocks0922@gmail.com>
Date: Sun, 8 Feb 2026 22:21:48 +0900
Subject: [PATCH 2/2] ai code review

---
 .github/workflows/pr-agent.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr-agent.yml b/.github/workflows/pr-agent.yml
index 7ee9478..5fb7c3f 100644
--- a/.github/workflows/pr-agent.yml
+++ b/.github/workflows/pr-agent.yml
@@ -30,14 +30,14 @@ jobs:
           CONFIG__CUSTOM_MODEL_MAX_TOKENS: "32768"
           CONFIG__RESPONSE_LANGUAGE: "ko-KR"
 
-          PR_REVIEWER__NUM_CODE_SUGGESTIONS: "3"
+          PR_REVIEWER__NUM_CODE_SUGGESTIONS: "5"
           PR_REVIEWER__REQUIRE_SECURITY_REVIEW: "true"
           PR_REVIEWER__EXTRA_INSTRUCTIONS: "버그, 성능 문제, 메모리 누수를 분석해주세요."
 
           PR_CODE_SUGGESTIONS__NUM_CODE_SUGGESTIONS: "5"
           PR_CODE_SUGGESTIONS__EXTRA_INSTRUCTIONS: "구체적인 코드 수정안을 제시해주세요."
 
-          PR_DESCRIPTION__PUBLISH_LABELS: "true"
+
           PR_DESCRIPTION__ADD_ORIGINAL_USER_DESCRIPTION: "true"
 
         run: |