From 29bb133bfe5d864a3397356cfd2cb52b51a24cc0 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Mon, 2 Jun 2025 14:00:49 +0200 Subject: [PATCH 01/11] feat: add knn density estimation with kdtree rs tree. --- Cargo.toml | 2 ++ src/density/knn.rs | 84 ++++++++++++++++++++++++++++++++++++++++++++++ src/density/mod.rs | 1 + src/lib.rs | 1 + 4 files changed, 88 insertions(+) create mode 100644 src/density/knn.rs create mode 100644 src/density/mod.rs diff --git a/Cargo.toml b/Cargo.toml index cd3125e0..4e96c4f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,8 @@ rand = ["dep:rand", "nalgebra?/rand"] [dependencies] approx = "0.5.0" num-traits = "0.2.14" +kdtree = "0.7.0" +thiserror = "2.0" [dependencies.rand] version = "0.9.0" diff --git a/src/density/knn.rs b/src/density/knn.rs new file mode 100644 index 00000000..ba8cfcad --- /dev/null +++ b/src/density/knn.rs @@ -0,0 +1,84 @@ +use core::f64::consts::PI; + +use kdtree::{distance::squared_euclidean, ErrorKind, KdTree}; +use thiserror::Error; + +use crate::function::gamma::gamma; + +#[derive(Error, Debug)] +pub enum DensityError { + /// Error when the k-d tree cannot be built or queried. + #[error(transparent)] + KdTree(#[from] ErrorKind), + EmptySample, + EmptyNeighborhood, +} + +impl core::fmt::Display for DensityError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self { + DensityError::KdTree(err) => write!(f, "K-d tree error: {}", err), + DensityError::EmptySample => write!(f, "No samples provided"), + DensityError::EmptyNeighborhood => write!(f, "No neighbors found"), + } + } +} + +fn orava_optimal_k(n_samples: f64) -> f64 { + // Adapted from K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012 + (0.587 * n_samples.powf(4.0 / 5.0)).round().max(1.) +} + +/// Computes the k-nearest neighbor density estimate for a given point `x` +/// using the samples provided. +/// +/// The optimal `k` is computed using Orava's formula. 
+/// +/// Returns `None` when `samples` is empty. +pub fn knn_pdf(x: f64, samples: &[f64]) -> Result { + if samples.is_empty() { + return Err(DensityError::EmptySample); + } + let n_samples = samples.len() as f64; + let k = orava_optimal_k(n_samples); + let mut tree = KdTree::with_capacity(1, n_samples.log2() as usize); + for (position, sample) in samples.iter().enumerate() { + tree.add([*sample], position)?; + } + let neighbors = tree.nearest(&[x], k as usize, &squared_euclidean)?; + if neighbors.is_empty() { + Err(DensityError::EmptyNeighborhood) + } else { + let radius = neighbors.last().unwrap().0.sqrt(); + let d = 1.; + Ok((k / n_samples) * (gamma(d / 2. + 1.) / (PI.powf(d / 2.) * radius.powf(d)))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distribution::{Continuous, Normal}; + use rand::distributions::Distribution; + + #[test] + fn test_knn_pdf() { + let law = Normal::new(0., 1.).unwrap(); + let mut rng = rand::thread_rng(); + let samples = (0..100000) + .map(|_| law.sample(&mut rng)) + .collect::>(); + let x = 0.0; + let knn_density = knn_pdf(x, &samples); + println!("Knn: {:?}", knn_density); + println!("Pdf: {:?}", law.pdf(x)); + } + + #[test] + fn test_knn_pdf_empty_samples() { + let samples: Vec = vec![]; + let x = 3.0; + let result = knn_pdf(x, &samples); + assert!(matches!(result, Err(DensityError::EmptySample))); + } +} diff --git a/src/density/mod.rs b/src/density/mod.rs new file mode 100644 index 00000000..7fb9d74b --- /dev/null +++ b/src/density/mod.rs @@ -0,0 +1 @@ +pub mod knn; diff --git a/src/lib.rs b/src/lib.rs index e72d0536..61fa30fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,7 @@ #![cfg_attr(not(feature = "std"), no_std)] pub mod consts; +pub mod density; pub mod distribution; pub mod euclid; pub mod function; From 9b984e8cf1ce214b3b9894495adab1b35d7168d3 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Mon, 2 Jun 2025 20:18:49 +0200 Subject: [PATCH 02/11] feat: add a more generic implementation of knn 
density estimation. --- src/density/knn.rs | 35 ++++++++++++++--------- src/density/mod.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 13 deletions(-) diff --git a/src/density/knn.rs b/src/density/knn.rs index ba8cfcad..5c4a3499 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -5,6 +5,8 @@ use thiserror::Error; use crate::function::gamma::gamma; +use super::Container; + #[derive(Error, Debug)] pub enum DensityError { /// Error when the k-d tree cannot be built or queried. @@ -35,22 +37,27 @@ fn orava_optimal_k(n_samples: f64) -> f64 { /// The optimal `k` is computed using Orava's formula. /// /// Returns `None` when `samples` is empty. -pub fn knn_pdf(x: f64, samples: &[f64]) -> Result { - if samples.is_empty() { +pub fn knn_pdf(x: X, samples: &S) -> Result +where + S: AsRef<[X]> + Container, + X: AsRef<[f64]> + Container + PartialEq, +{ + if samples.length() == 0 { return Err(DensityError::EmptySample); } - let n_samples = samples.len() as f64; + let n_samples = samples.length() as f64; let k = orava_optimal_k(n_samples); - let mut tree = KdTree::with_capacity(1, n_samples.log2() as usize); - for (position, sample) in samples.iter().enumerate() { - tree.add([*sample], position)?; + let d = x.length(); + let mut tree = KdTree::with_capacity(d, 1 + n_samples.log2() as usize); + for (position, sample) in samples.as_ref().iter().enumerate() { + tree.add(sample.clone(), position)?; } - let neighbors = tree.nearest(&[x], k as usize, &squared_euclidean)?; + let neighbors = tree.nearest(x.as_ref(), k as usize, &squared_euclidean)?; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { let radius = neighbors.last().unwrap().0.sqrt(); - let d = 1.; + let d = d as f64; Ok((k / n_samples) * (gamma(d / 2. + 1.) / (PI.powf(d / 2.) 
* radius.powf(d)))) } } @@ -59,26 +66,28 @@ pub fn knn_pdf(x: f64, samples: &[f64]) -> Result { mod tests { use super::*; use crate::distribution::{Continuous, Normal}; + use nalgebra::Vector1; use rand::distributions::Distribution; #[test] fn test_knn_pdf() { let law = Normal::new(0., 1.).unwrap(); let mut rng = rand::thread_rng(); - let samples = (0..100000) - .map(|_| law.sample(&mut rng)) + let samples = (0..10000) + .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); let x = 0.0; - let knn_density = knn_pdf(x, &samples); + let knn_density = knn_pdf(Vector1::new(x), &samples); println!("Knn: {:?}", knn_density); println!("Pdf: {:?}", law.pdf(x)); + assert!(knn_density.is_ok()); } #[test] fn test_knn_pdf_empty_samples() { - let samples: Vec = vec![]; + let samples: Vec<[f64; 1]> = vec![]; let x = 3.0; - let result = knn_pdf(x, &samples); + let result = knn_pdf([x], &samples); assert!(matches!(result, Err(DensityError::EmptySample))); } } diff --git a/src/density/mod.rs b/src/density/mod.rs index 7fb9d74b..4dd73609 100644 --- a/src/density/mod.rs +++ b/src/density/mod.rs @@ -1 +1,70 @@ pub mod knn; + +/// Handles variable/point types for which nearest neighbors can be computed. +pub trait Container: Clone { + type Elem; + fn length(&self) -> usize; +} + +macro_rules! impl_container_for_num { + ($($t:ty),*) => { + $( + impl Container for $t { + type Elem = $t; + fn length(&self) -> usize { + 1 + } + } + )* + }; +} +impl_container_for_num!(f32, f64); + +macro_rules! 
impl_container { + ($($t:ty),*) => { + $( + impl Container for $t { + type Elem = T; + fn length(&self) -> usize { + self.len() + } + + } + )* + }; +} +impl_container!( + [T; 1], + [T; 2], + [T; 3], + Vec, + nalgebra::Vector1, + nalgebra::Vector2, + nalgebra::Vector3, + nalgebra::Vector4, + nalgebra::Vector5, + nalgebra::Vector6 +); + +#[cfg(test)] +mod tests { + use nalgebra::Vector3; + + use super::*; + + #[test] + fn test_vec_container() { + let v1 = vec![1.0, 2.0, 3.0]; + assert_eq!(v1.length(), 3); + let v2 = Vector3::new(1.0, 2.0, 3.0); + assert_eq!(v2.length(), 3); + } + + #[test] + fn test_num_container() { + let a: f64 = 3.0; + let b: f64 = 5.0; + assert_eq!(a.length(), 1); + assert_eq!(b.length(), 1); + } +} From 4840f383568f972b8ba1838a0d03c95d23d11d63 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Tue, 3 Jun 2025 07:18:24 +0200 Subject: [PATCH 03/11] perf: add performance bench. --- Cargo.toml | 5 +++++ benches/density.rs | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 benches/density.rs diff --git a/Cargo.toml b/Cargo.toml index 4e96c4f1..7da6804b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,11 @@ name = "order_statistics" harness = false required-features = ["rand", "std"] +[[bench]] +name = "density" +harness = false +required-features = ["rand", "std"] + [features] default = ["std", "nalgebra", "rand"] std = ["nalgebra?/std", "rand?/std"] diff --git a/benches/density.rs b/benches/density.rs new file mode 100644 index 00000000..399ebffb --- /dev/null +++ b/benches/density.rs @@ -0,0 +1,32 @@ +extern crate criterion; +extern crate rand; +extern crate statrs; +use criterion::{criterion_group, criterion_main, Criterion}; + +fn generate_1d(n_samples: usize) -> Vec<[f64; 1]> { + (0..n_samples).map(|_| [rand::random()]).collect() +} + +fn generate_3d(n_samples: usize) -> Vec<[f64; 3]> { + (0..n_samples).map(|_| rand::random()).collect() +} + +fn bench_density(c: &mut Criterion) { + let samples = 
generate_1d(100_000); let mut group = c.benchmark_group("density"); group.bench_function("knn_density_1d", |b| { b.iter(|| { let _f = statrs::density::knn::knn_pdf([0.], &samples); }); }); let samples = generate_3d(100_000); group.bench_function("knn_density_3d", |b| { b.iter(|| { let _f = statrs::density::knn::knn_pdf([0., 0., 0.], &samples); }); }); } criterion_group!(benches, bench_density); criterion_main!(benches); From d2044a68cd63d84559f9c09934cd758947deab16 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Tue, 3 Jun 2025 08:36:59 +0200 Subject: [PATCH 04/11] feat: add bandwidth support for knn density estimation. --- benches/density.rs | 4 ++-- src/density/knn.rs | 54 ++++++++++++++++++----------------------- src/density/mod.rs | 22 +++++++++++++++++++ 3 files changed, 45 insertions(+), 35 deletions(-) diff --git a/benches/density.rs b/benches/density.rs index 399ebffb..ce3fe3de 100644 --- a/benches/density.rs +++ b/benches/density.rs @@ -16,13 +16,13 @@ fn bench_density(c: &mut Criterion) { let mut group = c.benchmark_group("density"); group.bench_function("knn_density_1d", |b| { b.iter(|| { - let _f = statrs::density::knn::knn_pdf([0.], &samples); + let _f = statrs::density::knn::knn_pdf([0.], &samples, None); }); }); let samples = generate_3d(100_000); group.bench_function("knn_density_3d", |b| { b.iter(|| { - let _f = statrs::density::knn::knn_pdf([0., 0., 0.], &samples); + let _f = statrs::density::knn::knn_pdf([0., 0., 0.], &samples, None); }); }); } diff --git a/src/density/knn.rs b/src/density/knn.rs index 5c4a3499..2f12422f 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -1,31 +1,11 @@ use core::f64::consts::PI; -use kdtree::{distance::squared_euclidean, ErrorKind, KdTree}; -use thiserror::Error; +use kdtree::{distance::squared_euclidean, KdTree}; -use crate::function::gamma::gamma; +use crate::{density::DensityError, function::gamma::gamma}; use super::Container; -#[derive(Error, Debug)] -pub enum 
DensityError { - /// Error when the k-d tree cannot be built or queried. - #[error(transparent)] - KdTree(#[from] ErrorKind), - EmptySample, - EmptyNeighborhood, -} - -impl core::fmt::Display for DensityError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - match self { - DensityError::KdTree(err) => write!(f, "K-d tree error: {}", err), - DensityError::EmptySample => write!(f, "No samples provided"), - DensityError::EmptyNeighborhood => write!(f, "No neighbors found"), - } - } -} - fn orava_optimal_k(n_samples: f64) -> f64 { // Adapted from K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012 (0.587 * n_samples.powf(4.0 / 5.0)).round().max(1.) @@ -37,7 +17,7 @@ fn orava_optimal_k(n_samples: f64) -> f64 { /// The optimal `k` is computed using Orava's formula. /// /// Returns `None` when `samples` is empty. -pub fn knn_pdf(x: X, samples: &S) -> Result +pub fn knn_pdf(x: X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, X: AsRef<[f64]> + Container + PartialEq, @@ -46,13 +26,19 @@ where return Err(DensityError::EmptySample); } let n_samples = samples.length() as f64; - let k = orava_optimal_k(n_samples); let d = x.length(); - let mut tree = KdTree::with_capacity(d, 1 + n_samples.log2() as usize); + let mut tree = KdTree::with_capacity(d, 2usize.pow(n_samples.log2() as u32)); for (position, sample) in samples.as_ref().iter().enumerate() { tree.add(sample.clone(), position)?; } - let neighbors = tree.nearest(x.as_ref(), k as usize, &squared_euclidean)?; + let (neighbors, k) = if let Some(bandwidth) = bandwidth { + let neighbors = tree.within(x.as_ref(), bandwidth, &squared_euclidean)?; + let k = neighbors.len() as f64; + (neighbors, k) + } else { + let k = orava_optimal_k(n_samples); + (tree.nearest(x.as_ref(), k as usize, &squared_euclidean)?, k) + }; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { @@ -65,8 +51,8 @@ where #[cfg(test)] mod tests { use super::*; 
- use crate::distribution::{Continuous, Normal}; - use nalgebra::Vector1; + use crate::distribution::Normal; + use nalgebra::Vector2; use rand::distributions::Distribution; #[test] @@ -74,12 +60,14 @@ mod tests { let law = Normal::new(0., 1.).unwrap(); let mut rng = rand::thread_rng(); let samples = (0..10000) - .map(|_| Vector1::new(law.sample(&mut rng))) + .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) .collect::>(); - let x = 0.0; - let knn_density = knn_pdf(Vector1::new(x), &samples); + let x = Vector2::new(0.0, 0.0); + let knn_density_with_bandwidth = knn_pdf(x, &samples, Some(0.75)); + let knn_density = knn_pdf(x, &samples, None); println!("Knn: {:?}", knn_density); - println!("Pdf: {:?}", law.pdf(x)); + println!("Knn with bandwidth: {:?}", knn_density_with_bandwidth); + // println!("Pdf: {:?}", law.pdf(x)); assert!(knn_density.is_ok()); } @@ -87,7 +75,7 @@ mod tests { fn test_knn_pdf_empty_samples() { let samples: Vec<[f64; 1]> = vec![]; let x = 3.0; - let result = knn_pdf([x], &samples); + let result = knn_pdf([x], &samples, None); assert!(matches!(result, Err(DensityError::EmptySample))); } } diff --git a/src/density/mod.rs b/src/density/mod.rs index 4dd73609..26e49f31 100644 --- a/src/density/mod.rs +++ b/src/density/mod.rs @@ -1,5 +1,27 @@ pub mod knn; +use kdtree::ErrorKind; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum DensityError { + /// Error when the k-d tree cannot be built or queried. + #[error(transparent)] + KdTree(#[from] ErrorKind), + EmptySample, + EmptyNeighborhood, +} + +impl core::fmt::Display for DensityError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match self { + DensityError::KdTree(err) => write!(f, "K-d tree error: {}", err), + DensityError::EmptySample => write!(f, "No samples provided"), + DensityError::EmptyNeighborhood => write!(f, "No neighbors found"), + } + } +} + /// Handles variable/point types for which nearest neighbors can be computed. 
pub trait Container: Clone { type Elem; From f606804aba86b230fd0192172d3eec1af11136ae Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Tue, 15 Jul 2025 11:03:00 +0200 Subject: [PATCH 05/11] feat: add first implementation of kernel density estimation. --- src/density/kde.rs | 181 +++++++++++++++++++++++++++++++++++++++++++++ src/density/knn.rs | 21 ++---- src/density/mod.rs | 7 +- 3 files changed, 195 insertions(+), 14 deletions(-) create mode 100644 src/density/kde.rs diff --git a/src/density/kde.rs b/src/density/kde.rs new file mode 100644 index 00000000..e3429cf2 --- /dev/null +++ b/src/density/kde.rs @@ -0,0 +1,181 @@ +use core::{ + f64::consts::{PI, SQRT_2}, + ops::{Div, Sub}, +}; + +use kdtree::{distance::squared_euclidean, KdTree}; + +use crate::density::{Container, DensityError}; + +/// The implemented [kernel functions][source] +/// +/// source: https://en.wikipedia.org/wiki/Kernel_(statistics) +#[derive(Debug, Default, Clone, Copy)] +pub enum Kernel { + #[default] + Epanechnikov, + Gaussian { + sigma: f64, + dim: i32, + }, + Uniform, + Triangular, + Biweigth, + Triweight, + Tricube, + Cosine, + Logistic, + Sigmoid, + Silverman, +} + +impl Kernel { + pub fn evaluate(&self, u: f64) -> f64 { + match self { + Self::Epanechnikov => { + if u.abs() >= 1. { + 0.0 + } else { + 0.75 * (1. - u.powi(2)) + } + } + Self::Gaussian { sigma, dim } => { + (-0.5 * (u / sigma).powi(2)).exp() / (crate::consts::SQRT_2PI.powi(*dim) * sigma) + } + Self::Uniform => { + if u.abs() > 1. { + 0.0 + } else { + 0.5 + } + } + Self::Triangular => { + let abs_u = u.abs(); + if abs_u >= 1. { + 0.0 + } else { + 1. - abs_u + } + } + Self::Biweigth => { + if u.abs() >= 1. { + 0.0 + } else { + (15. / 16.) * (1. - u.powi(2)).powi(2) + } + } + Self::Triweight => { + if u.abs() >= 1. { + 0.0 + } else { + (35. / 32.) * (1. - u.powi(2)).powi(3) + } + } + Self::Tricube => { + let abs_u = u.abs(); + if abs_u >= 1. { + 0.0 + } else { + (70. / 81.) * (1. 
- abs_u.powi(3)).powi(3) + } + } + Self::Cosine => { + if u.abs() >= 1. { + 0.0 + } else { + (PI / 4.) * ((PI / 2.) * u).cos() + } + } + Self::Logistic => 0.5 / (1. + u.cosh()), + Self::Sigmoid => 1. / (PI * u.cosh()), + Self::Silverman => { + let abs_u_over_sqrt2 = u.abs() / SQRT_2; + 0.5 * (-abs_u_over_sqrt2).exp() * (PI / 4. + abs_u_over_sqrt2).sin() + } + } + } +} + +/// Computes the kernel density estimate for a given one dimensional point `x` +/// using the samples provided and a specified kernel. +/// +/// The optimal `k` is computed using Orava's formula when `bandwidth` is `None`. +pub fn kde_pdf( + x: X, + samples: &S, + bandwidth: Option, + kernel: Kernel, +) -> Result +where + S: AsRef<[X]> + Container, + X: AsRef<[f64]> + Container + PartialEq, //+ Div + for<'a> &'a X: Sub<&'a X, Output = X> + Div, +{ + if samples.length() == 0 { + return Err(DensityError::EmptySample); + } + let n_samples = samples.length() as f64; + let d = x.length(); + let mut tree = KdTree::with_capacity(d, 2usize.pow(n_samples.log2() as u32)); + for (position, sample) in samples.as_ref().iter().enumerate() { + tree.add(sample.clone(), position)?; + } + let neighbors = if let Some(bandwidth) = bandwidth { + let neighbors = tree.within(x.as_ref(), bandwidth, &squared_euclidean)?; + neighbors + } else { + let k = super::orava_optimal_k(n_samples); + tree.nearest(x.as_ref(), k as usize, &squared_euclidean)? + }; + if neighbors.is_empty() { + Err(DensityError::EmptyNeighborhood) + } else { + let radius = neighbors.last().unwrap().0.sqrt(); + Ok((1. 
/ (n_samples * radius)) + * samples + .as_ref() + .iter() + .map(|xi| { + kernel.evaluate(squared_euclidean( + (&x / radius).as_ref(), + (xi / radius).as_ref(), + )) + }) + .sum::()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distribution::Normal; + use nalgebra::{Vector1, Vector2}; + use rand::distributions::Distribution; + + #[test] + fn test_kernel_1d() { + let kernel = Kernel::Epanechnikov; + assert_eq!(kernel.evaluate(0.5), 0.75 * 0.75); + assert_eq!(kernel.evaluate(1.5), 0.0); + + let kernel = Kernel::Gaussian { sigma: 1.0, dim: 1 }; + assert!((kernel.evaluate(0.0) - (1. / (SQRT_2 * PI.sqrt()))).abs() < 1e-10); + } + + #[test] + fn test_kde_pdf() { + let law = Normal::new(0., 1.).unwrap(); + let mut rng = rand::thread_rng(); + let samples = (0..100000) + // .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) + .map(|_| Vector1::new(law.sample(&mut rng))) + .collect::>(); + // let x = Vector2::new(0.0, 0.0); + let x = Vector1::new(0.0); + let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.5), Default::default()); + let kde_density = kde_pdf(x, &samples, None, Default::default()); + println!("Kde: {:?}", kde_density); + println!("Kde with bandwidth: {:?}", kde_density_with_bandwidth); + assert!(kde_density.is_ok()); + } +} diff --git a/src/density/knn.rs b/src/density/knn.rs index 2f12422f..88f1330b 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -4,19 +4,12 @@ use kdtree::{distance::squared_euclidean, KdTree}; use crate::{density::DensityError, function::gamma::gamma}; -use super::Container; +use super::{orava_optimal_k, Container}; -fn orava_optimal_k(n_samples: f64) -> f64 { - // Adapted from K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012 - (0.587 * n_samples.powf(4.0 / 5.0)).round().max(1.) 
-} - -/// Computes the k-nearest neighbor density estimate for a given point `x` +/// Computes the `k`-nearest neighbor density estimate for a given point `x` /// using the samples provided. /// -/// The optimal `k` is computed using Orava's formula. -/// -/// Returns `None` when `samples` is empty. +/// The optimal `k` is computed using Orava's formula when `bandwidth` is `None`. pub fn knn_pdf(x: X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, @@ -52,18 +45,20 @@ where mod tests { use super::*; use crate::distribution::Normal; - use nalgebra::Vector2; + use nalgebra::{Vector1, Vector2}; use rand::distributions::Distribution; #[test] fn test_knn_pdf() { let law = Normal::new(0., 1.).unwrap(); let mut rng = rand::thread_rng(); - let samples = (0..10000) + let samples = (0..100000) .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) + // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); let x = Vector2::new(0.0, 0.0); - let knn_density_with_bandwidth = knn_pdf(x, &samples, Some(0.75)); + // let x = Vector1::new(0.0); + let knn_density_with_bandwidth = knn_pdf(x, &samples, Some(0.2)); let knn_density = knn_pdf(x, &samples, None); println!("Knn: {:?}", knn_density); println!("Knn with bandwidth: {:?}", knn_density_with_bandwidth); diff --git a/src/density/mod.rs b/src/density/mod.rs index 26e49f31..3083bfdf 100644 --- a/src/density/mod.rs +++ b/src/density/mod.rs @@ -1,5 +1,5 @@ +pub mod kde; pub mod knn; - use kdtree::ErrorKind; use thiserror::Error; @@ -22,6 +22,11 @@ impl core::fmt::Display for DensityError { } } +fn orava_optimal_k(n_samples: f64) -> f64 { + // Adapted from K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012 + (0.587 * n_samples.powf(4.0 / 5.0)).round().max(1.) +} + /// Handles variable/point types for which nearest neighbors can be computed. 
pub trait Container: Clone { type Elem; From 6f28a6c42d99d9bbb65f8032f7dbd94b527973c6 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Tue, 15 Jul 2025 13:38:32 +0200 Subject: [PATCH 06/11] fix: kernel density computation. --- src/density/kde.rs | 39 ++++++++++++++++----------------------- src/density/knn.rs | 2 +- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/density/kde.rs b/src/density/kde.rs index e3429cf2..edd79fe1 100644 --- a/src/density/kde.rs +++ b/src/density/kde.rs @@ -15,7 +15,6 @@ pub enum Kernel { #[default] Epanechnikov, Gaussian { - sigma: f64, dim: i32, }, Uniform, @@ -39,9 +38,7 @@ impl Kernel { 0.75 * (1. - u.powi(2)) } } - Self::Gaussian { sigma, dim } => { - (-0.5 * (u / sigma).powi(2)).exp() / (crate::consts::SQRT_2PI.powi(*dim) * sigma) - } + Self::Gaussian { dim } => (-0.5 * u.powi(2)).exp() / crate::consts::SQRT_2PI.powi(*dim), Self::Uniform => { if u.abs() > 1. { 0.0 @@ -100,12 +97,7 @@ impl Kernel { /// using the samples provided and a specified kernel. /// /// The optimal `k` is computed using Orava's formula when `bandwidth` is `None`. -pub fn kde_pdf( - x: X, - samples: &S, - bandwidth: Option, - kernel: Kernel, -) -> Result +pub fn kde_pdf(x: X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, X: AsRef<[f64]> + Container + PartialEq, //+ Div @@ -131,15 +123,15 @@ where Err(DensityError::EmptyNeighborhood) } else { let radius = neighbors.last().unwrap().0.sqrt(); - Ok((1. / (n_samples * radius)) + let gaussian_kernel = Kernel::Gaussian { dim: d as i32 }; + Ok((1. 
/ (n_samples * radius.powi(d as i32))) * samples .as_ref() .iter() .map(|xi| { - kernel.evaluate(squared_euclidean( - (&x / radius).as_ref(), - (xi / radius).as_ref(), - )) + gaussian_kernel.evaluate( + squared_euclidean((&x / radius).as_ref(), (xi / radius).as_ref()).sqrt(), + ) }) .sum::()) } @@ -149,7 +141,7 @@ where mod tests { use super::*; use crate::distribution::Normal; - use nalgebra::{Vector1, Vector2}; + use nalgebra::Vector2; use rand::distributions::Distribution; #[test] @@ -158,7 +150,7 @@ mod tests { assert_eq!(kernel.evaluate(0.5), 0.75 * 0.75); assert_eq!(kernel.evaluate(1.5), 0.0); - let kernel = Kernel::Gaussian { sigma: 1.0, dim: 1 }; + let kernel = Kernel::Gaussian { dim: 1 }; assert!((kernel.evaluate(0.0) - (1. / (SQRT_2 * PI.sqrt()))).abs() < 1e-10); } @@ -167,15 +159,16 @@ mod tests { let law = Normal::new(0., 1.).unwrap(); let mut rng = rand::thread_rng(); let samples = (0..100000) - // .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) - .map(|_| Vector1::new(law.sample(&mut rng))) + .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) + // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); - // let x = Vector2::new(0.0, 0.0); - let x = Vector1::new(0.0); - let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.5), Default::default()); - let kde_density = kde_pdf(x, &samples, None, Default::default()); + let x = Vector2::new(0.0, 0.0); + // let x = Vector1::new(0.0); + let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.1)); + let kde_density = kde_pdf(x, &samples, None); println!("Kde: {:?}", kde_density); println!("Kde with bandwidth: {:?}", kde_density_with_bandwidth); + // println!("Pdf: {:?}", law.pdf(x[0])); assert!(kde_density.is_ok()); } } diff --git a/src/density/knn.rs b/src/density/knn.rs index 88f1330b..c096cd84 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -45,7 +45,7 @@ where mod tests { use super::*; use crate::distribution::Normal; - use nalgebra::{Vector1, 
Vector2}; + use nalgebra::Vector2; use rand::distributions::Distribution; #[test] From 770a2e4dd842b6e7af61472de5a0de6eac4625e1 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Tue, 15 Jul 2025 13:57:35 +0200 Subject: [PATCH 07/11] perf: add performance bench. --- benches/density.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/benches/density.rs b/benches/density.rs index ce3fe3de..dc45542f 100644 --- a/benches/density.rs +++ b/benches/density.rs @@ -2,29 +2,43 @@ extern crate criterion; extern crate rand; extern crate statrs; use criterion::{criterion_group, criterion_main, Criterion}; +use nalgebra::{Vector1, Vector3}; +use rand::distributions::Standard; -fn generate_1d(n_samples: usize) -> Vec<[f64; 1]> { - (0..n_samples).map(|_| [rand::random()]).collect() -} - -fn generate_3d(n_samples: usize) -> Vec<[f64; 3]> { +fn generate(n_samples: usize) -> Vec +where + Standard: rand::distributions::Distribution, +{ (0..n_samples).map(|_| rand::random()).collect() } - fn bench_density(c: &mut Criterion) { - let samples = generate_1d(100_000); + let samples = generate(100_00); + // generate_1d(100_000); let mut group = c.benchmark_group("density"); group.bench_function("knn_density_1d", |b| { b.iter(|| { let _f = statrs::density::knn::knn_pdf([0.], &samples, None); }); }); - let samples = generate_3d(100_000); + let samples = generate(100_000); group.bench_function("knn_density_3d", |b| { b.iter(|| { let _f = statrs::density::knn::knn_pdf([0., 0., 0.], &samples, None); }); }); + + let samples = generate(100_000); + group.bench_function("kde_density_1d", |b| { + b.iter(|| { + let _f = statrs::density::kde::kde_pdf(Vector1::new(0.), &samples, None); + }); + }); + let samples = generate(100_000); + group.bench_function("kde_density_3d", |b| { + b.iter(|| { + let _f = statrs::density::kde::kde_pdf(Vector3::new(0., 0., 0.), &samples, None); + }); + }); } criterion_group!(benches, bench_density); From 
a2bf0c8a3aff61baad93ad7bc1f3751cc9a11a88 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Sun, 20 Jul 2025 15:28:56 +0200 Subject: [PATCH 08/11] chore: some refactoring. --- src/density/kde.rs | 42 +++++++++++---------------------- src/density/knn.rs | 35 +++++++++------------------- src/density/mod.rs | 58 ++++++++++++++++++++++++++++------------------ 3 files changed, 59 insertions(+), 76 deletions(-) diff --git a/src/density/kde.rs b/src/density/kde.rs index edd79fe1..e66f26b5 100644 --- a/src/density/kde.rs +++ b/src/density/kde.rs @@ -1,11 +1,8 @@ -use core::{ - f64::consts::{PI, SQRT_2}, - ops::{Div, Sub}, -}; +use core::f64::consts::{PI, SQRT_2}; -use kdtree::{distance::squared_euclidean, KdTree}; +use kdtree::distance::squared_euclidean; -use crate::density::{Container, DensityError}; +use crate::density::{nearest_neighbors, Container, DensityError}; /// The implemented [kernel functions][source] /// @@ -93,45 +90,32 @@ impl Kernel { } } -/// Computes the kernel density estimate for a given one dimensional point `x` +/// Computes the kernel density estimate for a given point `x` /// using the samples provided and a specified kernel. /// -/// The optimal `k` is computed using Orava's formula when `bandwidth` is `None`. +/// The optimal `k` is computed using [Orava's][orava] formula when `bandwidth` is `None`. +/// +/// orava: K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012. 
pub fn kde_pdf(x: X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, - X: AsRef<[f64]> + Container + PartialEq, //+ Div - for<'a> &'a X: Sub<&'a X, Output = X> + Div, + X: AsRef<[f64]> + Container + PartialEq, { - if samples.length() == 0 { - return Err(DensityError::EmptySample); - } let n_samples = samples.length() as f64; let d = x.length(); - let mut tree = KdTree::with_capacity(d, 2usize.pow(n_samples.log2() as u32)); - for (position, sample) in samples.as_ref().iter().enumerate() { - tree.add(sample.clone(), position)?; - } - let neighbors = if let Some(bandwidth) = bandwidth { - let neighbors = tree.within(x.as_ref(), bandwidth, &squared_euclidean)?; - neighbors - } else { - let k = super::orava_optimal_k(n_samples); - tree.nearest(x.as_ref(), k as usize, &squared_euclidean)? - }; + let neighbors = nearest_neighbors(&x, samples, bandwidth)?.0; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { - let radius = neighbors.last().unwrap().0.sqrt(); + let radius = neighbors.last().unwrap().sqrt(); let gaussian_kernel = Kernel::Gaussian { dim: d as i32 }; Ok((1. 
/ (n_samples * radius.powi(d as i32))) * samples .as_ref() .iter() .map(|xi| { - gaussian_kernel.evaluate( - squared_euclidean((&x / radius).as_ref(), (xi / radius).as_ref()).sqrt(), - ) + gaussian_kernel + .evaluate(squared_euclidean(x.as_ref(), xi.as_ref()).sqrt() / radius) }) .sum::()) } @@ -164,7 +148,7 @@ mod tests { .collect::>(); let x = Vector2::new(0.0, 0.0); // let x = Vector1::new(0.0); - let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.1)); + let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.05)); let kde_density = kde_pdf(x, &samples, None); println!("Kde: {:?}", kde_density); println!("Kde with bandwidth: {:?}", kde_density_with_bandwidth); diff --git a/src/density/knn.rs b/src/density/knn.rs index c096cd84..afe94478 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -1,41 +1,28 @@ +use super::Container; +use crate::{ + density::{nearest_neighbors, DensityError}, + function::gamma::gamma, +}; use core::f64::consts::PI; -use kdtree::{distance::squared_euclidean, KdTree}; - -use crate::{density::DensityError, function::gamma::gamma}; - -use super::{orava_optimal_k, Container}; - /// Computes the `k`-nearest neighbor density estimate for a given point `x` /// using the samples provided. /// -/// The optimal `k` is computed using Orava's formula when `bandwidth` is `None`. +/// The optimal `k` is computed using [Orava's][orava] formula when `bandwidth` is `None`. +/// +/// orava: K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012. 
pub fn knn_pdf(x: X, samples: &S, bandwidth: Option) -> Result where - S: AsRef<[X]> + Container, + S: AsRef<[X]> + Container, X: AsRef<[f64]> + Container + PartialEq, { - if samples.length() == 0 { - return Err(DensityError::EmptySample); - } let n_samples = samples.length() as f64; let d = x.length(); - let mut tree = KdTree::with_capacity(d, 2usize.pow(n_samples.log2() as u32)); - for (position, sample) in samples.as_ref().iter().enumerate() { - tree.add(sample.clone(), position)?; - } - let (neighbors, k) = if let Some(bandwidth) = bandwidth { - let neighbors = tree.within(x.as_ref(), bandwidth, &squared_euclidean)?; - let k = neighbors.len() as f64; - (neighbors, k) - } else { - let k = orava_optimal_k(n_samples); - (tree.nearest(x.as_ref(), k as usize, &squared_euclidean)?, k) - }; + let (neighbors, k) = nearest_neighbors(&x, samples, bandwidth)?; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { - let radius = neighbors.last().unwrap().0.sqrt(); + let radius = neighbors.last().unwrap().sqrt(); let d = d as f64; Ok((k / n_samples) * (gamma(d / 2. + 1.) / (PI.powf(d / 2.) * radius.powf(d)))) } diff --git a/src/density/mod.rs b/src/density/mod.rs index 3083bfdf..9cd2faa5 100644 --- a/src/density/mod.rs +++ b/src/density/mod.rs @@ -1,6 +1,6 @@ pub mod kde; pub mod knn; -use kdtree::ErrorKind; +use kdtree::{distance::squared_euclidean, ErrorKind, KdTree}; use thiserror::Error; #[derive(Error, Debug)] @@ -33,20 +33,6 @@ pub trait Container: Clone { fn length(&self) -> usize; } -macro_rules! impl_container_for_num { - ($($t:ty),*) => { - $( - impl Container for $t { - type Elem = $t; - fn length(&self) -> usize { - 1 - } - } - )* - }; -} -impl_container_for_num!(f32, f64); - macro_rules! 
impl_container { ($($t:ty),*) => { $( @@ -72,7 +58,41 @@ impl_container!( nalgebra::Vector5, nalgebra::Vector6 ); +pub type NearestNeighbors = (Vec, f64); +pub(crate) fn nearest_neighbors( + x: &X, + samples: &S, + bandwidth: Option, +) -> Result +where + S: AsRef<[X]> + Container, + X: AsRef<[f64]> + Container + PartialEq, +{ + if samples.length() == 0 { + return Err(DensityError::EmptySample); + } + let n_samples = samples.length() as f64; + let d = x.length(); + let mut tree = KdTree::with_capacity(d, 2usize.pow(n_samples.log2() as u32)); + for (position, sample) in samples.as_ref().iter().enumerate() { + tree.add(sample.clone(), position)?; + } + if let Some(bandwidth) = bandwidth { + let neighbors = tree.within(x.as_ref(), bandwidth, &squared_euclidean)?; + let k = neighbors.len() as f64; + Ok((neighbors.into_iter().map(|r| r.0).collect(), k)) + } else { + let k = orava_optimal_k(n_samples); + Ok(( + tree.nearest(x.as_ref(), k as usize, &squared_euclidean)? + .into_iter() + .map(|r| r.0) + .collect(), + k, + )) + } +} #[cfg(test)] mod tests { use nalgebra::Vector3; @@ -86,12 +106,4 @@ mod tests { let v2 = Vector3::new(1.0, 2.0, 3.0); assert_eq!(v2.length(), 3); } - - #[test] - fn test_num_container() { - let a: f64 = 3.0; - let b: f64 = 5.0; - assert_eq!(a.length(), 1); - assert_eq!(b.length(), 1); - } } From 2476696c8bf33819edbf401014a215d0487e7720 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Sun, 20 Jul 2025 22:43:02 +0200 Subject: [PATCH 09/11] chore: density functions borrowing evaluation points instead of owning them. 
--- benches/density.rs | 12 +++++++----- src/density/kde.rs | 16 ++++++++-------- src/density/knn.rs | 15 +++++++-------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/benches/density.rs b/benches/density.rs index dc45542f..c85431c3 100644 --- a/benches/density.rs +++ b/benches/density.rs @@ -11,32 +11,34 @@ where { (0..n_samples).map(|_| rand::random()).collect() } + fn bench_density(c: &mut Criterion) { let samples = generate(100_00); - // generate_1d(100_000); let mut group = c.benchmark_group("density"); group.bench_function("knn_density_1d", |b| { b.iter(|| { - let _f = statrs::density::knn::knn_pdf([0.], &samples, None); + let _f = statrs::density::knn::knn_pdf(&[0.], &samples, None); }); }); + let samples = generate(100_000); group.bench_function("knn_density_3d", |b| { b.iter(|| { - let _f = statrs::density::knn::knn_pdf([0., 0., 0.], &samples, None); + let _f = statrs::density::knn::knn_pdf(&[0., 0., 0.], &samples, None); }); }); let samples = generate(100_000); group.bench_function("kde_density_1d", |b| { b.iter(|| { - let _f = statrs::density::kde::kde_pdf(Vector1::new(0.), &samples, None); + let _f = statrs::density::kde::kde_pdf(&Vector1::new(0.), &samples, None); }); }); + let samples = generate(100_000); group.bench_function("kde_density_3d", |b| { b.iter(|| { - let _f = statrs::density::kde::kde_pdf(Vector3::new(0., 0., 0.), &samples, None); + let _f = statrs::density::kde::kde_pdf(&Vector3::new(0., 0., 0.), &samples, None); }); }); } diff --git a/src/density/kde.rs b/src/density/kde.rs index e66f26b5..fcbf473a 100644 --- a/src/density/kde.rs +++ b/src/density/kde.rs @@ -96,20 +96,20 @@ impl Kernel { /// The optimal `k` is computed using [Orava's][orava] formula when `bandwidth` is `None`. /// /// orava: K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012. 
-pub fn kde_pdf(x: X, samples: &S, bandwidth: Option) -> Result +pub fn kde_pdf(x: &X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, X: AsRef<[f64]> + Container + PartialEq, { let n_samples = samples.length() as f64; - let d = x.length(); - let neighbors = nearest_neighbors(&x, samples, bandwidth)?.0; + let neighbors = nearest_neighbors(x, samples, bandwidth)?.0; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { let radius = neighbors.last().unwrap().sqrt(); - let gaussian_kernel = Kernel::Gaussian { dim: d as i32 }; - Ok((1. / (n_samples * radius.powi(d as i32))) + let d = x.length() as i32; + let gaussian_kernel = Kernel::Gaussian { dim: d }; + Ok((1. / (n_samples * radius.powi(d))) * samples .as_ref() .iter() @@ -146,10 +146,10 @@ mod tests { .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); - let x = Vector2::new(0.0, 0.0); + let x = Vector2::new(1.0, 0.0); // let x = Vector1::new(0.0); - let kde_density_with_bandwidth = kde_pdf(x, &samples, Some(0.05)); - let kde_density = kde_pdf(x, &samples, None); + let kde_density_with_bandwidth = kde_pdf(&x, &samples, Some(0.05)); + let kde_density = kde_pdf(&x, &samples, None); println!("Kde: {:?}", kde_density); println!("Kde with bandwidth: {:?}", kde_density_with_bandwidth); // println!("Pdf: {:?}", law.pdf(x[0])); diff --git a/src/density/knn.rs b/src/density/knn.rs index afe94478..aba690b5 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -11,19 +11,18 @@ use core::f64::consts::PI; /// The optimal `k` is computed using [Orava's][orava] formula when `bandwidth` is `None`. /// /// orava: K-nearest neighbour kernel density estimation, the choice of optimal k; Jan Orava 2012. 
-pub fn knn_pdf(x: X, samples: &S, bandwidth: Option) -> Result +pub fn knn_pdf(x: &X, samples: &S, bandwidth: Option) -> Result where S: AsRef<[X]> + Container, X: AsRef<[f64]> + Container + PartialEq, { let n_samples = samples.length() as f64; - let d = x.length(); - let (neighbors, k) = nearest_neighbors(&x, samples, bandwidth)?; + let (neighbors, k) = nearest_neighbors(x, samples, bandwidth)?; if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { let radius = neighbors.last().unwrap().sqrt(); - let d = d as f64; + let d = x.length() as f64; Ok((k / n_samples) * (gamma(d / 2. + 1.) / (PI.powf(d / 2.) * radius.powf(d)))) } } @@ -43,10 +42,10 @@ mod tests { .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); - let x = Vector2::new(0.0, 0.0); + let x = Vector2::new(1.0, 0.0); // let x = Vector1::new(0.0); - let knn_density_with_bandwidth = knn_pdf(x, &samples, Some(0.2)); - let knn_density = knn_pdf(x, &samples, None); + let knn_density_with_bandwidth = knn_pdf(&x, &samples, Some(0.2)); + let knn_density = knn_pdf(&x, &samples, None); println!("Knn: {:?}", knn_density); println!("Knn with bandwidth: {:?}", knn_density_with_bandwidth); // println!("Pdf: {:?}", law.pdf(x)); @@ -57,7 +56,7 @@ mod tests { fn test_knn_pdf_empty_samples() { let samples: Vec<[f64; 1]> = vec![]; let x = 3.0; - let result = knn_pdf([x], &samples, None); + let result = knn_pdf(&[x], &samples, None); assert!(matches!(result, Err(DensityError::EmptySample))); } } From b54076e4ddafb10278fe616d0cfa69c13b2cbd97 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Mon, 17 Nov 2025 03:44:34 +0100 Subject: [PATCH 10/11] fix imports and add some tests. 
--- benches/density.rs | 8 +-- src/density/kde.rs | 148 ++++++++++----------------------------------- src/density/knn.rs | 42 +++++++++---- src/density/mod.rs | 2 +- 4 files changed, 67 insertions(+), 133 deletions(-) diff --git a/benches/density.rs b/benches/density.rs index c85431c3..4cac74b2 100644 --- a/benches/density.rs +++ b/benches/density.rs @@ -1,19 +1,19 @@ extern crate criterion; extern crate rand; extern crate statrs; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use nalgebra::{Vector1, Vector3}; -use rand::distributions::Standard; +use rand::distr::StandardUniform; fn generate(n_samples: usize) -> Vec where - Standard: rand::distributions::Distribution, + StandardUniform: rand::distr::Distribution, { (0..n_samples).map(|_| rand::random()).collect() } fn bench_density(c: &mut Criterion) { - let samples = generate(100_00); + let samples = generate(100_000); let mut group = c.benchmark_group("density"); group.bench_function("knn_density_1d", |b| { b.iter(|| { diff --git a/src/density/kde.rs b/src/density/kde.rs index fcbf473a..2994ebdc 100644 --- a/src/density/kde.rs +++ b/src/density/kde.rs @@ -1,94 +1,6 @@ -use core::f64::consts::{PI, SQRT_2}; - use kdtree::distance::squared_euclidean; -use crate::density::{nearest_neighbors, Container, DensityError}; - -/// The implemented [kernel functions][source] -/// -/// source: https://en.wikipedia.org/wiki/Kernel_(statistics) -#[derive(Debug, Default, Clone, Copy)] -pub enum Kernel { - #[default] - Epanechnikov, - Gaussian { - dim: i32, - }, - Uniform, - Triangular, - Biweigth, - Triweight, - Tricube, - Cosine, - Logistic, - Sigmoid, - Silverman, -} - -impl Kernel { - pub fn evaluate(&self, u: f64) -> f64 { - match self { - Self::Epanechnikov => { - if u.abs() >= 1. { - 0.0 - } else { - 0.75 * (1. 
- u.powi(2)) - } - } - Self::Gaussian { dim } => (-0.5 * u.powi(2)).exp() / crate::consts::SQRT_2PI.powi(*dim), - Self::Uniform => { - if u.abs() > 1. { - 0.0 - } else { - 0.5 - } - } - Self::Triangular => { - let abs_u = u.abs(); - if abs_u >= 1. { - 0.0 - } else { - 1. - abs_u - } - } - Self::Biweigth => { - if u.abs() >= 1. { - 0.0 - } else { - (15. / 16.) * (1. - u.powi(2)).powi(2) - } - } - Self::Triweight => { - if u.abs() >= 1. { - 0.0 - } else { - (35. / 32.) * (1. - u.powi(2)).powi(3) - } - } - Self::Tricube => { - let abs_u = u.abs(); - if abs_u >= 1. { - 0.0 - } else { - (70. / 81.) * (1. - abs_u.powi(3)).powi(3) - } - } - Self::Cosine => { - if u.abs() >= 1. { - 0.0 - } else { - (PI / 4.) * ((PI / 2.) * u).cos() - } - } - Self::Logistic => 0.5 / (1. + u.cosh()), - Self::Sigmoid => 1. / (PI * u.cosh()), - Self::Silverman => { - let abs_u_over_sqrt2 = u.abs() / SQRT_2; - 0.5 * (-abs_u_over_sqrt2).exp() * (PI / 4. + abs_u_over_sqrt2).sin() - } - } - } -} +use crate::density::{Container, DensityError, nearest_neighbors}; /// Computes the kernel density estimate for a given point `x` /// using the samples provided and a specified kernel. @@ -106,16 +18,16 @@ where if neighbors.is_empty() { Err(DensityError::EmptyNeighborhood) } else { - let radius = neighbors.last().unwrap().sqrt(); + let radius = neighbors.last().unwrap().sqrt(); // safe to unwrap here since `neighbors` is not empty let d = x.length() as i32; - let gaussian_kernel = Kernel::Gaussian { dim: d }; Ok((1. 
/ (n_samples * radius.powi(d))) * samples .as_ref() .iter() .map(|xi| { - gaussian_kernel - .evaluate(squared_euclidean(x.as_ref(), xi.as_ref()).sqrt() / radius) + (-0.5 * (squared_euclidean(x.as_ref(), xi.as_ref()).sqrt() / radius).powi(2)) + .exp() + / crate::consts::SQRT_2PI.powi(d) }) .sum::()) } @@ -123,36 +35,42 @@ where #[cfg(test)] mod tests { + use core::f32::consts::PI; + use super::*; use crate::distribution::Normal; - use nalgebra::Vector2; - use rand::distributions::Distribution; - - #[test] - fn test_kernel_1d() { - let kernel = Kernel::Epanechnikov; - assert_eq!(kernel.evaluate(0.5), 0.75 * 0.75); - assert_eq!(kernel.evaluate(1.5), 0.0); - - let kernel = Kernel::Gaussian { dim: 1 }; - assert!((kernel.evaluate(0.0) - (1. / (SQRT_2 * PI.sqrt()))).abs() < 1e-10); - } + use crate::function::kernel::Kernel; + use nalgebra::{Vector1, Vector2}; + use rand::distr::Distribution; #[test] fn test_kde_pdf() { let law = Normal::new(0., 1.).unwrap(); - let mut rng = rand::thread_rng(); - let samples = (0..100000) + let mut rng = rand::rng(); + let gaussian = crate::function::kernel::Gaussian; + let samples_1d = (0..100000) + .map(|_| Vector1::new(law.sample(&mut rng))) + .collect::>(); + let x = Vector1::new(0.); + let kde_density_with_bandwidth = kde_pdf(&x, &samples_1d, Some(0.05)); + let kde_density = kde_pdf(&x, &samples_1d, None); + let reference_value = gaussian.evaluate(0.); + assert!(kde_density.is_ok()); + assert!(kde_density_with_bandwidth.is_ok()); + assert!((kde_density.unwrap() - reference_value).abs() < 2e-2); + assert!((kde_density_with_bandwidth.unwrap() - reference_value).abs() < 3e-2); + + let samples_2d = (0..100000) .map(|_| Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) - // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); - let x = Vector2::new(1.0, 0.0); - // let x = Vector1::new(0.0); - let kde_density_with_bandwidth = kde_pdf(&x, &samples, Some(0.05)); - let kde_density = kde_pdf(&x, &samples, None); - println!("Kde: 
{:?}", kde_density); - println!("Kde with bandwidth: {:?}", kde_density_with_bandwidth); - // println!("Pdf: {:?}", law.pdf(x[0])); + + let x = Vector2::new(0., 0.); + let kde_density_with_bandwidth = kde_pdf(&x, &samples_2d, Some(0.05)); + let kde_density = kde_pdf(&x, &samples_2d, None); + let reference_value = 1. / (2. * PI) as f64; assert!(kde_density.is_ok()); + assert!(kde_density_with_bandwidth.is_ok()); + assert!((kde_density.unwrap() - reference_value).abs() < 2e-2); + assert!((kde_density_with_bandwidth.unwrap() - reference_value).abs() < 3e-2); } } diff --git a/src/density/knn.rs b/src/density/knn.rs index aba690b5..28dbfe2b 100644 --- a/src/density/knn.rs +++ b/src/density/knn.rs @@ -1,6 +1,6 @@ use super::Container; use crate::{ - density::{nearest_neighbors, DensityError}, + density::{DensityError, nearest_neighbors}, function::gamma::gamma, }; use core::f64::consts::PI; @@ -29,27 +29,43 @@ where #[cfg(test)] mod tests { + use core::f32::consts::PI; + use super::*; use crate::distribution::Normal; - use nalgebra::Vector2; - use rand::distributions::Distribution; + use crate::function::kernel::Kernel; + use nalgebra::{Vector1, Vector2}; + use rand::distr::Distribution; #[test] fn test_knn_pdf() { let law = Normal::new(0., 1.).unwrap(); - let mut rng = rand::thread_rng(); - let samples = (0..100000) + let mut rng = rand::rng(); + let gaussian = crate::function::kernel::Gaussian; + let samples_1d = (0..100000) + .map(|_| Vector1::new(law.sample(&mut rng))) + .collect::>(); + let x = Vector1::new(0.); + let knn_density_with_bandwidth = knn_pdf(&x, &samples_1d, Some(0.05)); + let knn_density = knn_pdf(&x, &samples_1d, None); + let reference_value = gaussian.evaluate(0.); + assert!(knn_density.is_ok()); + assert!(knn_density_with_bandwidth.is_ok()); + assert!((knn_density.unwrap() - reference_value).abs() < 2e-2); + assert!((knn_density_with_bandwidth.unwrap() - reference_value).abs() < 3e-2); + + let samples_2d = (0..100000) .map(|_| 
Vector2::new(law.sample(&mut rng), law.sample(&mut rng))) - // .map(|_| Vector1::new(law.sample(&mut rng))) .collect::>(); - let x = Vector2::new(1.0, 0.0); - // let x = Vector1::new(0.0); - let knn_density_with_bandwidth = knn_pdf(&x, &samples, Some(0.2)); - let knn_density = knn_pdf(&x, &samples, None); - println!("Knn: {:?}", knn_density); - println!("Knn with bandwidth: {:?}", knn_density_with_bandwidth); - // println!("Pdf: {:?}", law.pdf(x)); + + let x = Vector2::new(0., 0.); + let knn_density_with_bandwidth = knn_pdf(&x, &samples_2d, Some(0.05)); + let knn_density = knn_pdf(&x, &samples_2d, None); + let reference_value = 1. / (2. * PI) as f64; assert!(knn_density.is_ok()); + assert!(knn_density_with_bandwidth.is_ok()); + assert!((knn_density.unwrap() - reference_value).abs() < 2e-2); + assert!((knn_density_with_bandwidth.unwrap() - reference_value).abs() < 3e-2); } #[test] diff --git a/src/density/mod.rs b/src/density/mod.rs index 9cd2faa5..6e35af38 100644 --- a/src/density/mod.rs +++ b/src/density/mod.rs @@ -1,6 +1,6 @@ pub mod kde; pub mod knn; -use kdtree::{distance::squared_euclidean, ErrorKind, KdTree}; +use kdtree::{ErrorKind, KdTree, distance::squared_euclidean}; use thiserror::Error; #[derive(Error, Debug)] From 0bb1f5e36f205b9dde9bf5160f8132dc79f90908 Mon Sep 17 00:00:00 2001 From: Georges SARR Date: Mon, 17 Nov 2025 04:55:10 +0100 Subject: [PATCH 11/11] add use of one dimensional gaussian kernel function. 
--- src/density/kde.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/density/kde.rs b/src/density/kde.rs index 2994ebdc..3e5d8dc1 100644 --- a/src/density/kde.rs +++ b/src/density/kde.rs @@ -1,6 +1,9 @@ use kdtree::distance::squared_euclidean; -use crate::density::{Container, DensityError, nearest_neighbors}; +use crate::{ + density::{Container, DensityError, nearest_neighbors}, + function::kernel::{Gaussian, Kernel}, +}; /// Computes the kernel density estimate for a given point `x` /// using the samples provided and a specified kernel. @@ -25,9 +28,8 @@ where .as_ref() .iter() .map(|xi| { - (-0.5 * (squared_euclidean(x.as_ref(), xi.as_ref()).sqrt() / radius).powi(2)) - .exp() - / crate::consts::SQRT_2PI.powi(d) + Gaussian.evaluate(squared_euclidean(x.as_ref(), xi.as_ref()).sqrt() / radius) + / crate::consts::SQRT_2PI.powi(d - 1) }) .sum::()) }