Files
kmeans_rs/src/lib.rs
T
w.pomp 20e04c8b53
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
first commit
2026-04-16 16:26:53 +02:00

697 lines
24 KiB
Rust

use console::Term;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use kmeans::*;
use ndarray::{Array2, AsArray, Ix1, Ix2};
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;
use pyo3_stub_gen::{StubGenConfig, StubInfo};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
/// Errors surfaced by this crate's k-means bindings.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The progress-bar template string failed to parse (raised by `get_bar`).
    #[error(transparent)]
    ProgressBarTemplate(#[from] indicatif::style::TemplateError),
    /// A point array's dimensionality does not match the model's (got, expected).
    #[error("shape mismatch: {0} != {1}")]
    ShapeMismatch(usize, usize),
    /// The model has no centroids, so prediction/scoring is impossible.
    #[error("no centroids defined")]
    NoCentroidsDefined,
}
/// Convert a crate [`Error`] into a Python exception by routing it through a
/// `color_eyre` report, which keeps the full error chain in the message.
impl From<Error> for PyErr {
    fn from(err: Error) -> PyErr {
        // color_eyre reports convert into PyErr, carrying context along.
        color_eyre::eyre::Report::from(err).into()
    }
}
/// a progress bar with an ok style that when py::detach is used also works in jupyter
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
    // Draw to a buffered stdout terminal, refreshed at 20 Hz, so the bar also
    // renders when stdout is captured (e.g. inside Jupyter).
    let target = ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20);
    let template = ProgressStyle::with_template(
        "{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
    )?;
    let length = count.map(|total| total as u64);
    let progress =
        ProgressBar::with_draw_target(length, target).with_style(template.progress_chars("#>-"));
    // Keep the spinner animating even between explicit increments.
    progress.enable_steady_tick(Duration::from_millis(100));
    Ok(progress)
}
/// Prediction and scoring operations over a fitted k-means state.
trait Predict<T> {
    /// Error type returned by the trait's methods.
    type Error;
    /// For each row of `points`, find the index of the closest centroid and
    /// the distance to it. Returns `(assignments, distances)`, one entry per row.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
    where
        A: AsArray<'a, T, Ix2>,
        T: 'a;
    /// Mean "simple" silhouette score computed against centroids (not pairwise
    /// point distances). `assignments` must be given when the points are not
    /// the training set (i.e. the stored assignments do not apply).
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>;
}
/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansInit {
    /// K-Means++ seeding (Matlab-style).
    PlusPlus(),
    /// Random partition of the samples into k groups; partition means become centroids.
    RandomPartition(),
    /// Random sample of k points as centroids (Forgy).
    RandomSample(),
    /// User-supplied centroids, stored as a flattened vector.
    Precomputed(Vec<f64>),
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansInit {
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn plusplus() -> Self {
        Self::PlusPlus()
    }
    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn random_partition() -> Self {
        Self::RandomPartition()
    }
    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn random_sample() -> Self {
        Self::RandomSample()
    }
    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> Self {
        // Store the 2D centroid array as one flattened buffer.
        Self::Precomputed(centroids.as_array().flatten().to_vec())
    }
}
/// Specify a kmeans algorithm using lloyd or mini_batch.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansAlgorithm {
    /// Full-batch Lloyd iterations.
    Lloyd(),
    /// Mini-batch k-means; the payload is the batch size used per iteration.
    MiniBatch(usize),
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansAlgorithm {
    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn lloyd() -> Self {
        Self::Lloyd()
    }
    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn mini_batch(batch_size: usize) -> Self {
        Self::MiniBatch(batch_size)
    }
}
/// Compute kmeans clustering
/// this implementation is supposed to be faster than scipy or scikit-learn
/// when dealing with a lot of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **k**: Amount of clusters to search for
/// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
/// - **init**: initialization method
/// - **algorithm**: algorithm to use
#[gen_stub_pyclass]
#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) struct PyKMeans {
    // dimensionality of the training points (and thus of each centroid)
    ndim: usize,
    // fitted state from the `kmeans` crate: centroids, assignments,
    // per-point centroid distances, centroid frequencies, distsum, k
    inner: KMeansState<f64>,
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeans {
    #[new]
    #[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))]
    pub(crate) fn new(
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        k: usize,
        max_iter: usize,
        init: Option<PyKMeansInit>,
        algorithm: Option<PyKMeansAlgorithm>,
    ) -> Self {
        let points = points.as_array();
        // Release the GIL while fitting; everything below is pure Rust.
        py.detach(|| {
            let shape = points.shape();
            // Borrow the data zero-copy when it is contiguous, otherwise copy
            // it into a flat buffer first.
            let kmeans = if let Some(s) = points.as_slice() {
                KMeans::<f64, 8, _>::new(s, shape[0], shape[1], EuclideanDistance)
            } else {
                let v = points.flatten().to_vec();
                KMeans::<f64, 8, _>::new(v.as_slice(), shape[0], shape[1], EuclideanDistance)
            };
            // Defaults: k-means++ initialization and Lloyd's algorithm.
            let init = init.unwrap_or(PyKMeansInit::PlusPlus());
            let algorithm = algorithm.unwrap_or(PyKMeansAlgorithm::Lloyd());
            let config = KMeansConfig::default();
            match algorithm {
                PyKMeansAlgorithm::Lloyd() => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config)
                        }
                        PyKMeansInit::RandomPartition() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config)
                        }
                        PyKMeansInit::RandomSample() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config)
                        }
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd(
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
                PyKMeansAlgorithm::MiniBatch(size) => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_kmeanplusplus,
                            &config,
                        ),
                        PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_partition,
                            &config,
                        ),
                        PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_sample,
                            &config,
                        ),
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
            }
        })
    }
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn init_plusplus() -> PyKMeansInit {
        PyKMeansInit::PlusPlus()
    }
    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn init_random_partition() -> PyKMeansInit {
        PyKMeansInit::RandomPartition()
    }
    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn init_random_sample() -> PyKMeansInit {
        PyKMeansInit::RandomSample()
    }
    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn init_precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyKMeansInit {
        PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec())
    }
    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::Lloyd()
    }
    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::MiniBatch(batch_size)
    }
    /// find the closest cluster and the distance for each point
    pub(crate) fn predict(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyResult<(Vec<usize>, Vec<f64>)> {
        let points = points.as_array();
        // Release the GIL: the search over centroids is pure Rust (rayon inside).
        Ok(py.detach(|| self.inner.predict(points))?)
    }
    /// calculate the mean simple (using centroids) silhouette score for a set of points,
    /// assignments must be specified if they do not correspond to the assignments in the KMeans instance
    #[pyo3(signature = (points, assignments = None))]
    pub(crate) fn silhouette_simple(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        assignments: Option<PyArrayLike1<usize, AllowTypeChange>>,
    ) -> PyResult<f64> {
        let points = points.as_array();
        let assignments = assignments.as_ref().map(|a| a.as_array());
        Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
    }
    /// number of dimensions
    #[getter]
    pub(crate) fn ndim(&self) -> usize {
        self.ndim
    }
    /// number of clusters
    #[getter]
    pub(crate) fn k(&self) -> usize {
        self.inner.k
    }
    /// sum of all distances, cost measure
    #[getter]
    pub(crate) fn distance_sum(&self) -> f64 {
        self.inner.distsum
    }
    /// centroid coordinates
    #[getter]
    pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
        // Reshape the flat centroid buffer into a (k, ndim) numpy array.
        let v = self.inner.centroids.to_vec();
        Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v)
            .map_err(|e| PyErr::new::<PyTypeError, String>(e.to_string()))?
            .into_pyarray(py))
    }
    /// centroid frequencies
    #[getter]
    pub(crate) fn centroid_frequency(&self) -> Vec<usize> {
        self.inner.centroid_frequency.clone()
    }
    /// to which cluster each of the points is assigned
    #[getter]
    pub(crate) fn assignments(&self) -> Vec<usize> {
        self.inner.assignments.clone()
    }
    /// distances of all points to the center it's assigned to
    #[getter]
    pub(crate) fn centroid_distances(&self) -> Vec<f64> {
        self.inner.centroid_distances.clone()
    }
}
impl Predict<f64> for KMeansState<f64> {
    type Error = Error;

    /// Assign each row of `points` to the nearest centroid.
    /// Returns `(assignments, distances)`, one entry per row.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<f64>), Self::Error>
    where
        A: AsArray<'a, f64, Ix2>,
    {
        let centroids = self.centroids.to_vec();
        // Check for an empty model BEFORE dividing by k below: previously an
        // empty centroid list reported ShapeMismatch (and k == 0 panicked).
        if centroids.is_empty() || self.k == 0 {
            return Err(Error::NoCentroidsDefined);
        }
        let ndim = centroids.len() / self.k;
        let points = points.into();
        let shape = points.shape();
        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        // Pad both operands with zeros up to the next multiple of the SIMD lane
        // width (8); zeros do not change the euclidean distance. The `% 8`
        // avoids appending a full extra lane when ndim is already a multiple of 8.
        let fill = vec![0.0; (8 - ndim % 8) % 8];
        let e = EuclideanDistance;
        let dist = |s: &[f64]| {
            s.par_chunks_exact(ndim)
                .map(|point| {
                    // Pad the query point once, not once per centroid.
                    let padded_point = [point, &fill].concat();
                    let (i, d) = centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .fold(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, centroid)| {
                                let b = <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &padded_point,
                                    &[centroid, &fill].concat(),
                                );
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        )
                        .reduce(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, b)| {
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        );
                    // sqrt matches the original behavior (the kernel value is
                    // treated as a squared distance throughout this file)
                    (i, d.sqrt())
                })
                .collect::<(Vec<_>, Vec<_>)>()
        };
        Ok(if let Some(s) = points.as_slice() {
            dist(s)
        } else {
            // Non-contiguous input: copy into a contiguous buffer first.
            let s = points.flatten().to_vec();
            dist(&s)
        })
    }

    /// Mean "simple" silhouette score: `a` is the distance to the assigned
    /// centroid, `b` the distance to the nearest other centroid, and each
    /// point scores `(b - a) / max(a, b)`; the mean over all points is returned.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>,
    {
        let centroids = Arc::new(self.centroids.to_vec());
        // Reject an empty model before using k as a divisor.
        if centroids.is_empty() || self.k == 0 {
            return Err(Error::NoCentroidsDefined);
        }
        let ndim = centroids.len() / self.k;
        let points = points.into();
        let shape = points.shape();
        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        if shape[0] == 0 {
            // No points: return 0 instead of dividing by zero at the end.
            return Ok(0.0);
        }
        // Use the stored assignments unless the caller provides their own.
        let assignments = if let Some(assignments) = assignments {
            assignments.into().to_vec()
        } else {
            self.assignments.to_vec()
        };
        let k = self.k;
        // Bucket the flattened coordinates of each point into its cluster.
        let mut clusters = vec![Vec::new(); k];
        for (point, assignment) in points.rows().into_iter().zip(assignments) {
            clusters[assignment].extend(point.to_vec());
        }
        // Zero-padding up to the SIMD lane width (see `predict`).
        let fill = vec![0.0; (8 - ndim % 8) % 8];
        // a: distance from each point to its own centroid.
        let a = clusters
            .par_iter()
            .zip(centroids.clone().par_chunks_exact(ndim))
            .flat_map(|(points, centroid)| {
                let c = [centroid, &fill].concat();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                        &e,
                        &c,
                        &[point, &fill].concat(),
                    )
                    .sqrt()
                })
            })
            .collect::<Vec<_>>();
        // b: distance from each point to the nearest *other* centroid.
        let b = clusters
            .par_iter()
            .enumerate()
            .flat_map(|(i, points)| {
                let centroids = centroids.clone();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    // Pad the point once, not once per centroid.
                    let point = [point, &fill].concat();
                    centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .map(|(j, centroid)| {
                            if i == j {
                                // The own centroid never counts as a neighbour.
                                f64::INFINITY
                            } else {
                                <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &[centroid, &fill].concat(),
                                    &point,
                                )
                                .sqrt()
                            }
                        })
                        .min_by(|a, b| a.total_cmp(b))
                        .unwrap_or(f64::INFINITY)
                })
            })
            .collect::<Vec<_>>();
        Ok(a.into_iter()
            .zip(b)
            .map(|(a, b)| {
                let s = (b - a) / a.max(b);
                // k == 1 leaves b = inf and s = inf/inf = NaN; score that as 0
                // instead of poisoning the mean.
                if s.is_nan() { 0.0 } else { s }
            })
            .sum::<f64>()
            / shape[0] as f64)
    }
}
/// Mean full (pairwise) silhouette score over all points.
///
/// For each point: `a` = mean distance to the other members of its own
/// cluster, `b` = smallest mean distance to the members of any other cluster,
/// and the per-point score is `(b - a) / max(a, b)`; the mean is returned.
/// Labels may be any hashable type; they are remapped to dense indices.
fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result<f64, Error>
where
    P: AsArray<'p, f64, Ix2>,
    A: AsArray<'a, K, Ix1>,
    K: 'a + Eq + Hash,
{
    let points = points.into();
    let assignments = assignments.into();
    let shape = points.shape();
    let n = shape[0];
    let ndim = shape[1];
    if n == 0 {
        // No points: return 0 instead of dividing by zero at the end.
        return Ok(0.0);
    }
    // Map arbitrary (hashable) labels onto dense indices 0..k.
    let labels = assignments
        .iter()
        .collect::<HashSet<_>>()
        .into_iter()
        .enumerate()
        .map(|(k, v)| (v, k))
        .collect::<HashMap<_, _>>();
    let assignments = assignments.iter().map(|k| labels[k]).collect::<Vec<_>>();
    let k = labels.len();
    // Bucket the flattened coordinates of each point into its cluster. Every
    // dense label occurs at least once, so no cluster ends up empty.
    let mut clusters = vec![Vec::new(); k];
    for (point, assignment) in points.rows().into_iter().zip(assignments) {
        clusters[assignment].extend(point.to_vec());
    }
    // Total increments: 1 per point in the a-phase plus k per point in the
    // b-phase (the previous length k*n + k*n*k overcounted by a factor of k).
    let bar = get_bar(Some(n + n * k))?;
    // Zero-pad to the SIMD lane width (8); zeros do not affect the distance.
    // `% 8` avoids a full extra lane when ndim is already a multiple of 8.
    let fill = vec![0.0; (8 - ndim % 8) % 8];
    let e = EuclideanDistance;
    // a[i]: mean distance from point i to the other members of its cluster.
    let a = clusters
        .par_iter()
        .flat_map(|points| {
            // Divide by (cluster size - 1): the point's zero distance to
            // itself is included in the sum. A singleton cluster gives
            // 0.0 / 0.0 = NaN, which is mapped to 0 in the final combine.
            let c = (points.len() / ndim - 1) as f64;
            points
                .par_chunks_exact(ndim)
                .map(|i| {
                    // Pad the point once, not once per comparison.
                    let i = [i, &fill].concat();
                    let q = points
                        .par_chunks_exact(ndim)
                        .map(|j| {
                            <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                &e,
                                &i,
                                &[j, &fill].concat(),
                            )
                            .sqrt()
                        })
                        .sum::<f64>()
                        / c;
                    bar.inc(1);
                    q
                })
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();
    // b[i]: smallest mean distance from point i to another cluster's members.
    let b = clusters
        .par_iter()
        .enumerate()
        .flat_map(|(i, points_i)| {
            points_i
                .par_chunks_exact(ndim)
                .map(|a| {
                    let a = [a, &fill].concat();
                    clusters
                        .par_iter()
                        .enumerate()
                        .map(|(j, points_j)| {
                            let c = (points_j.len() / ndim) as f64;
                            let q = if i == j {
                                // The own cluster never counts as a neighbour.
                                f64::INFINITY
                            } else {
                                points_j
                                    .par_chunks_exact(ndim)
                                    .map(|b| {
                                        <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                            &e,
                                            &a,
                                            &[b, &fill].concat(),
                                        )
                                        .sqrt()
                                    })
                                    .sum::<f64>()
                                    / c
                            };
                            bar.inc(1);
                            q
                        })
                        .min_by(|a, b| a.total_cmp(b))
                        .unwrap_or(f64::INFINITY)
                })
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();
    bar.finish();
    Ok(a.into_iter()
        .zip(b)
        .map(|(a, b)| {
            let s = (b - a) / a.max(b);
            // Singleton clusters (a = NaN) and the single-cluster case
            // (b = inf, s = inf/inf = NaN) have no defined silhouette;
            // score them as 0 by convention instead of returning NaN.
            if s.is_nan() { 0.0 } else { s }
        })
        .sum::<f64>()
        / n as f64)
}
/// calculate the mean silhouette score for a set of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **assignments**: cluster label for each point (one entry per row of points)
#[gen_stub_pyfunction(module = "kmeans_rs")]
#[pyfunction(name = "silhouette")]
pub(crate) fn py_silhouette(
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    assignments: PyArrayLike1<usize, AllowTypeChange>,
) -> PyResult<f64> {
    let points = points.as_array();
    let assignments = assignments.as_array();
    // Release the GIL: the computation is pure Rust (parallelized with rayon).
    Ok(py.detach(|| silhouette(points, assignments))?)
}
/// generates kmeans/__init__.pyi
#[pyfunction]
fn generate_stub(dest_path: String) -> PyResult<()> {
    // Collect stub information for the "kmeans_rs" module rooted at
    // <dest_path>/py, then write out the .pyi file(s).
    let stubs = StubInfo::from_project_root(
        "kmeans_rs".to_string(),
        PathBuf::from(dest_path).join("py"),
        true,
        StubGenConfig::default(),
    )?;
    stubs.generate()?;
    Ok(())
}
/// The `kmeans_rs` Python extension module: exports the KMeans class, its
/// init/algorithm enums, the silhouette function and the stub generator.
#[pymodule]
#[pyo3(name = "kmeans_rs")]
mod kmeans_rs {
    use pyo3::prelude::*;
    #[pymodule_export]
    use super::generate_stub;
    #[pymodule_export]
    use super::PyKMeans;
    #[pymodule_export]
    use super::PyKMeansInit;
    #[pymodule_export]
    use super::PyKMeansAlgorithm;
    #[pymodule_export]
    use super::py_silhouette;
    /// Module init: install color_eyre so Rust errors render with full context.
    #[pymodule_init]
    fn init(_: &Bound<'_, PyModule>) -> PyResult<()> {
        Ok(color_eyre::install()?)
    }
}