use console::Term;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use kmeans::*;
use ndarray::{Array2, AsArray, Ix1, Ix2};
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;
use pyo3_stub_gen::{StubGenConfig, StubInfo};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;

/// Crate-level error type; converted to a `PyErr` at the Python boundary.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The progress-bar template string failed to parse.
    #[error(transparent)]
    ProgressBarTemplate(#[from] indicatif::style::TemplateError),
    /// Point dimensionality does not match centroid dimensionality.
    #[error("shape mismatch: {0} != {1}")]
    ShapeMismatch(usize, usize),
    /// The model has no centroids (e.g. it was never fitted).
    #[error("no centroids defined")]
    NoCentroidsDefined,
}

impl From<Error> for PyErr {
    fn from(err: Error) -> PyErr {
        // Route through color_eyre so Python sees a nicely formatted report.
        color_eyre::eyre::Report::from(err).into()
    }
}

/// a progress bar with an ok style that when py::detach is used also works in jupyter
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
    let style = ProgressStyle::with_template(
        "{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
    )?
    .progress_chars("#>-");
    // Draw to a buffered terminal at 20 Hz so output stays readable in notebooks
    // even while the GIL is released.
    let bar = ProgressBar::with_draw_target(
        count.map(|i| i as u64),
        ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20),
    )
    .with_style(style);
    bar.enable_steady_tick(Duration::from_millis(100));
    Ok(bar)
}

/// Prediction / scoring helpers implemented on a fitted k-means state.
trait Predict<T> {
    type Error;

    /// For each point, find the closest centroid; returns (assignments, distances).
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
    where
        A: AsArray<'a, T, Ix2>,
        T: 'a;

    /// Mean simplified silhouette score (uses distances to centroids rather than
    /// pairwise point distances). `assignments` overrides the fitted assignments.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>;
}

/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
#[gen_stub_pyclass_complex_enum] #[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)] #[derive(Clone, Debug)] pub(crate) enum PyKMeansInit { PlusPlus(), RandomPartition(), RandomSample(), Precomputed(Vec), } #[gen_stub_pymethods] #[pymethods] impl PyKMeansInit { /// K-Means++ initialization method, as implemented in Matlab /// /// ## Description /// This initialization method starts by selecting one sample as first centroid. /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from /// their currently assigned cluster's centroid. /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) #[staticmethod] pub(crate) fn plusplus() -> Self { Self::PlusPlus() } /// Random-Partition initialization method /// /// ## Description /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means. /// These means are then used as initial clusters. #[staticmethod] pub(crate) fn random_partition() -> Self { Self::RandomPartition() } /// Random sample initialization method (a.k.a. Forgy) /// /// ## Description /// This initialization method randomly selects k centroids from the samples as initial centroids. #[staticmethod] pub(crate) fn random_sample() -> Self { Self::RandomSample() } /// Precomputed centroids initialization method /// /// ## Description /// This initialization method requires a precomputed list of k centroids to use as initial /// centroids. 
#[staticmethod] pub(crate) fn precomputed( #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] centroids: PyArrayLike2, ) -> Self { Self::Precomputed(centroids.as_array().flatten().to_vec()) } } /// Specify a kmeans algorithm using lloyd or mini_batch. #[gen_stub_pyclass_complex_enum] #[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)] #[derive(Clone, Debug)] pub(crate) enum PyKMeansAlgorithm { Lloyd(), MiniBatch(usize), } #[gen_stub_pymethods] #[pymethods] impl PyKMeansAlgorithm { /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) #[staticmethod] pub(crate) fn lloyd() -> Self { Self::Lloyd() } /// Mini-Batch k-Means implementation. /// (see: https://dl.acm.org/citation.cfm?id=1772862) /// /// ## Arguments /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower) #[staticmethod] pub(crate) fn mini_batch(batch_size: usize) -> Self { Self::MiniBatch(batch_size) } } /// Compute kmeans clustering /// this implementation is supposed to be faster than scipy or scikit-learn /// when dealing with a lot of points /// /// ## Arguments /// - **points**: Numpy array #points x dimensions /// - **k**: Amount of clusters to search for /// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite) /// - **init**: initialization method /// - **algorithm**: algorithm to use #[gen_stub_pyclass] #[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)] #[derive(Clone, Debug)] pub(crate) struct PyKMeans { ndim: usize, inner: KMeansState, } #[gen_stub_pymethods] #[pymethods] impl PyKMeans { #[new] #[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))] pub(crate) fn new( py: Python, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", 
"numpy.typing")))] points: PyArrayLike2, k: usize, max_iter: usize, init: Option, algorithm: Option, ) -> Self { let points = points.as_array(); py.detach(|| { let shape = points.shape(); let kmeans = if let Some(s) = points.as_slice() { KMeans::::new(s, shape[0], shape[1], EuclideanDistance) } else { let v = points.flatten().to_vec(); KMeans::::new(v.as_slice(), shape[0], shape[1], EuclideanDistance) }; let init = if let Some(init) = init { init } else { PyKMeansInit::PlusPlus() }; let algorithm = if let Some(algorithm) = algorithm { algorithm } else { PyKMeansAlgorithm::Lloyd() }; let config = KMeansConfig::default(); match algorithm { PyKMeansAlgorithm::Lloyd() => PyKMeans { ndim: shape[1], inner: match init { PyKMeansInit::PlusPlus() => { kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config) } PyKMeansInit::RandomPartition() => { kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config) } PyKMeansInit::RandomSample() => { kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config) } PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd( k, max_iter, KMeans::init_precomputed(centroids), &config, ), }, }, PyKMeansAlgorithm::MiniBatch(size) => PyKMeans { ndim: shape[1], inner: match init { PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch( size, k, max_iter, KMeans::init_kmeanplusplus, &config, ), PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch( size, k, max_iter, KMeans::init_random_partition, &config, ), PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch( size, k, max_iter, KMeans::init_random_sample, &config, ), PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch( size, k, max_iter, KMeans::init_precomputed(centroids), &config, ), }, }, } }) } /// K-Means++ initialization method, as implemented in Matlab /// /// ## Description /// This initialization method starts by selecting one sample as first centroid. 
/// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from /// their currently assigned cluster's centroid. /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) #[staticmethod] pub(crate) fn init_plusplus() -> PyKMeansInit { PyKMeansInit::PlusPlus() } /// Random-Parition initialization method /// /// ## Description /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means. /// These means are then used as initial clusters. #[staticmethod] pub(crate) fn init_random_partition() -> PyKMeansInit { PyKMeansInit::RandomPartition() } /// Random sample initialization method (a.k.a. Forgy) /// /// ## Description /// This initialization method randomly selects k centroids from the samples as initial centroids. #[staticmethod] pub(crate) fn init_random_sample() -> PyKMeansInit { PyKMeansInit::RandomSample() } /// Precomputed centroids initialization method /// /// ## Description /// This initialization method requires a precomputed list of k centroids to use as initial /// centroids. #[staticmethod] pub(crate) fn init_precomputed( #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] centroids: PyArrayLike2, ) -> PyKMeansInit { PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec()) } /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). 
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) #[staticmethod] pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm { PyKMeansAlgorithm::Lloyd() } /// Mini-Batch k-Means implementation. /// (see: https://dl.acm.org/citation.cfm?id=1772862) /// /// ## Arguments /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower) #[staticmethod] pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm { PyKMeansAlgorithm::MiniBatch(batch_size) } /// find the closest cluster and the distance for each point pub(crate) fn predict( &self, py: Python, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] points: PyArrayLike2, ) -> PyResult<(Vec, Vec)> { let points = points.as_array(); Ok(py.detach(|| self.inner.predict(points))?) } /// calculate the mean simple (using centroids) silhouette score for a set of points, /// assignments must be specified if they do not correspond to the assignments in the KMeans instance #[pyo3(signature = (points, assignments = None))] pub(crate) fn silhouette_simple( &self, py: Python, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] points: PyArrayLike2, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] assignments: Option>, ) -> PyResult { let points = points.as_array(); let assignments = assignments.as_ref().map(|a| a.as_array()); Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?) 
} /// number of dimensions #[getter] pub(crate) fn ndim(&self) -> usize { self.ndim } /// number of clusters #[getter] pub(crate) fn k(&self) -> usize { self.inner.k } /// sum of all distances, cost measure #[getter] pub(crate) fn distance_sum(&self) -> f64 { self.inner.distsum } /// centroid coordinates #[getter] pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult>> { let v = self.inner.centroids.to_vec(); Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v) .map_err(|e| PyErr::new::(e.to_string()))? .into_pyarray(py)) } /// centroid frequencies #[getter] pub(crate) fn centroid_frequency(&self) -> Vec { self.inner.centroid_frequency.clone() } /// to which cluster each of the points is assigned #[getter] pub(crate) fn assignments(&self) -> Vec { self.inner.assignments.clone() } /// distances of all points to the center it's assigned to #[getter] pub(crate) fn centroid_distances(&self) -> Vec { self.inner.centroid_distances.clone() } } impl Predict for KMeansState { type Error = Error; fn predict<'a, A>(&self, points: A) -> Result<(Vec, Vec), Self::Error> where A: AsArray<'a, f64, Ix2>, { let centroids = self.centroids.to_vec(); let ndim = centroids.len() / self.k; let points = points.into(); let shape = points.shape(); if shape[1] != ndim { return Err(Error::ShapeMismatch(shape[1], ndim)); } if centroids.is_empty() { return Err(Error::NoCentroidsDefined); } let fill = vec![0.0; 8 - ndim % 8]; let e = EuclideanDistance; let dist = |s: &[f64]| { s.par_chunks_exact(ndim) .map(|point| { let (i, d) = centroids .par_chunks_exact(ndim) .enumerate() .fold( || (usize::MAX, f64::INFINITY), |(i, a), (j, centroid)| { let b = >::distance( &e, &[point, &fill].concat(), &[centroid, &fill].concat(), ); if a <= b { (i, a) } else { (j, b) } }, ) .reduce( || (usize::MAX, f64::INFINITY), |(i, a), (j, b)| { if a <= b { (i, a) } else { (j, b) } }, ); (i, d.sqrt()) }) .collect::<(Vec<_>, Vec<_>)>() }; Ok(if let Some(s) = points.as_slice() { dist(s) } else { let s = 
points.flatten().to_vec(); dist(&s) }) } fn silhouette_simple<'p, 'a, P, A>( &self, points: P, assignments: Option, ) -> Result where P: AsArray<'p, f64, Ix2>, A: AsArray<'a, usize, Ix1>, { let points = points.into(); let shape = points.shape(); let centroids = Arc::new(self.centroids.to_vec()); let ndim = centroids.len() / self.k; if shape[1] != ndim { return Err(Error::ShapeMismatch(shape[1], ndim)); } if centroids.is_empty() { return Err(Error::NoCentroidsDefined); } let assignments = if let Some(assignments) = assignments { assignments.into().to_vec() } else { self.assignments.to_vec() }; let k = self.k; let mut clusters = vec![Vec::new(); k]; for (point, assignment) in points.rows().into_iter().zip(assignments) { clusters[assignment].extend(point.to_vec()); } let fill = vec![0.0; 8 - ndim % 8]; let a = clusters .par_iter() .zip(centroids.clone().par_chunks_exact(ndim)) .flat_map(|(points, centroid)| { let c = [centroid, &fill].concat(); let fill = fill.clone(); let e = EuclideanDistance; points.par_chunks_exact(ndim).map(move |point| { >::distance( &e, &c, &[point, &fill].concat(), ) .sqrt() }) }) .collect::>(); let b = clusters .par_iter() .enumerate() .flat_map(|(i, points)| { let centroids = centroids.clone(); let fill = fill.clone(); let e = EuclideanDistance; points.par_chunks_exact(ndim).map(move |point| { centroids .par_chunks_exact(ndim) .enumerate() .map(|(j, centroid)| { if i == j { f64::INFINITY } else { >::distance( &e, &[centroid, &fill].concat(), &[point, &fill].concat(), ) .sqrt() } }) .min_by(|a, b| a.total_cmp(b)) .unwrap_or(f64::INFINITY) }) }) .collect::>(); Ok(a.into_iter() .zip(b) .map(|(a, b)| (b - a) / a.max(b)) .sum::() / points.shape()[0] as f64) } } fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result where P: AsArray<'p, f64, Ix2>, A: AsArray<'a, K, Ix1>, K: 'a + Eq + Hash, { let points = points.into(); let assignments = assignments.into(); let shape = points.shape(); let n = shape[0]; let ndim = shape[1]; let labels = 
assignments .iter() .collect::>() .into_iter() .enumerate() .map(|(k, v)| (v, k)) .collect::>(); let assignments = assignments.iter().map(|k| labels[k]).collect::>(); let k = labels.len(); let mut clusters = vec![Vec::new(); k]; for (point, assignment) in points.rows().into_iter().zip(assignments) { clusters[assignment].extend(point.to_vec()); } let bar = get_bar(Some(k * n + k * n * k))?; let fill = vec![0.0; 8 - ndim % 8]; let e = EuclideanDistance; let a = clusters .par_iter() .flat_map(|points| { let c = (points.len() / ndim - 1) as f64; points .par_chunks_exact(ndim) .map(|i| { let q = points .par_chunks_exact(ndim) .map(|j| { >::distance( &e, &[i, &fill].concat(), &[j, &fill].concat(), ) .sqrt() }) .sum::() / c; bar.inc(1); q }) .collect::>() }) .collect::>(); let b = clusters .par_iter() .enumerate() .flat_map(|(i, points_i)| { points_i .par_chunks_exact(ndim) .map(|a| { clusters .par_iter() .enumerate() .map(|(j, points_j)| { let c = (points_j.len() / ndim) as f64; let q = if i == j { f64::INFINITY } else { points_j .par_chunks_exact(ndim) .map(|b| { >::distance( &e, &[a, &fill].concat(), &[b, &fill].concat(), ) .sqrt() }) .sum::() / c }; bar.inc(1); q }) .min_by(|a, b| a.total_cmp(b)) .unwrap_or(f64::INFINITY) }) .collect::>() }) .collect::>(); bar.finish(); Ok(a.into_iter() .zip(b) .map(|(a, b)| (b - a) / a.max(b)) .sum::() / points.shape()[0] as f64) } /// calculate the mean silhouette score for a set of points #[gen_stub_pyfunction(module = "kmeans_rs")] #[pyfunction(name = "silhouette")] pub(crate) fn py_silhouette( py: Python, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] points: PyArrayLike2, #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] assignments: PyArrayLike1, ) -> PyResult { let points = points.as_array(); let assignments = assignments.as_array(); Ok(py.detach(|| silhouette(points, assignments))?) 
} /// generates kmeans/__init__.pyi #[pyfunction] fn generate_stub(dest_path: String) -> PyResult<()> { Ok(StubInfo::from_project_root( "kmeans_rs".to_string(), PathBuf::from(dest_path).join("py"), true, StubGenConfig::default(), )? .generate()?) } #[pymodule] #[pyo3(name = "kmeans_rs")] mod kmeans_rs { use pyo3::prelude::*; #[pymodule_export] use super::generate_stub; #[pymodule_export] use super::PyKMeans; #[pymodule_export] use super::PyKMeansInit; #[pymodule_export] use super::PyKMeansAlgorithm; #[pymodule_export] use super::py_silhouette; #[pymodule_init] fn init(_: &Bound<'_, PyModule>) -> PyResult<()> { Ok(color_eyre::install()?) } }