Files
kmeans_rs/src/lib.rs
T
w.pomp 20e04c8b53
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
first commit
2026-04-16 16:26:53 +02:00

697 lines
24 KiB
Rust

use console::Term;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use kmeans::*;
use ndarray::{Array2, AsArray, Ix1, Ix2};
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;
use pyo3_stub_gen::{StubGenConfig, StubInfo};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::hash::Hash;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
/// Errors surfaced by this crate's k-means bindings.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The progress-bar template string failed to parse (raised by `get_bar`).
    #[error(transparent)]
    ProgressBarTemplate(#[from] indicatif::style::TemplateError),
    /// A point array's dimensionality does not match the model's (got, expected).
    #[error("shape mismatch: {0} != {1}")]
    ShapeMismatch(usize, usize),
    /// The model has no centroids, so prediction/scoring is impossible.
    #[error("no centroids defined")]
    NoCentroidsDefined,
}
/// Convert a crate [`Error`] into a Python exception by routing it through a
/// `color_eyre` report, which keeps the full error chain in the message.
impl From<Error> for PyErr {
    fn from(err: Error) -> PyErr {
        // color_eyre reports convert into PyErr, carrying context along.
        color_eyre::eyre::Report::from(err).into()
    }
}
/// a progress bar with an ok style that when py::detach is used also works in jupyter
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
    // Draw to a buffered stdout terminal, refreshed at 20 Hz, so the bar also
    // renders when stdout is captured (e.g. inside Jupyter).
    let target = ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20);
    let template = ProgressStyle::with_template(
        "{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
    )?;
    let length = count.map(|total| total as u64);
    let progress =
        ProgressBar::with_draw_target(length, target).with_style(template.progress_chars("#>-"));
    // Keep the spinner animating even between explicit increments.
    progress.enable_steady_tick(Duration::from_millis(100));
    Ok(progress)
}
/// Prediction and scoring operations over a fitted k-means state.
trait Predict<T> {
    /// Error type returned by the trait's methods.
    type Error;
    /// For each row of `points`, find the index of the closest centroid and
    /// the distance to it. Returns `(assignments, distances)`, one entry per row.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
    where
        A: AsArray<'a, T, Ix2>,
        T: 'a;
    /// Mean "simple" silhouette score computed against centroids (not pairwise
    /// point distances). `assignments` must be given when the points are not
    /// the training set (i.e. the stored assignments do not apply).
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>;
}
/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansInit {
    /// K-Means++ seeding (Matlab-style).
    PlusPlus(),
    /// Random partition of the samples into k groups; partition means become centroids.
    RandomPartition(),
    /// Random sample of k points as centroids (Forgy).
    RandomSample(),
    /// User-supplied centroids, stored as a flattened vector.
    Precomputed(Vec<f64>),
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansInit {
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn plusplus() -> Self {
        Self::PlusPlus()
    }
    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn random_partition() -> Self {
        Self::RandomPartition()
    }
    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn random_sample() -> Self {
        Self::RandomSample()
    }
    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> Self {
        // Store the 2D centroid array as one flattened buffer.
        Self::Precomputed(centroids.as_array().flatten().to_vec())
    }
}
/// Specify a kmeans algorithm using lloyd or mini_batch.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansAlgorithm {
    /// Full-batch Lloyd iterations.
    Lloyd(),
    /// Mini-batch k-means; the payload is the batch size used per iteration.
    MiniBatch(usize),
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansAlgorithm {
    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn lloyd() -> Self {
        Self::Lloyd()
    }
    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn mini_batch(batch_size: usize) -> Self {
        Self::MiniBatch(batch_size)
    }
}
/// Compute kmeans clustering
/// this implementation is supposed to be faster than scipy or scikit-learn
/// when dealing with a lot of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **k**: Amount of clusters to search for
/// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
/// - **init**: initialization method
/// - **algorithm**: algorithm to use
#[gen_stub_pyclass]
#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) struct PyKMeans {
    // dimensionality of the training points (and thus of each centroid)
    ndim: usize,
    // fitted state from the `kmeans` crate: centroids, assignments,
    // per-point centroid distances, centroid frequencies, distsum, k
    inner: KMeansState<f64>,
}
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeans {
    #[new]
    #[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))]
    pub(crate) fn new(
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        k: usize,
        max_iter: usize,
        init: Option<PyKMeansInit>,
        algorithm: Option<PyKMeansAlgorithm>,
    ) -> Self {
        let points = points.as_array();
        // Release the GIL while fitting; everything below is pure Rust.
        py.detach(|| {
            let shape = points.shape();
            // Borrow the data zero-copy when it is contiguous, otherwise copy
            // it into a flat buffer first.
            let kmeans = if let Some(s) = points.as_slice() {
                KMeans::<f64, 8, _>::new(s, shape[0], shape[1], EuclideanDistance)
            } else {
                let v = points.flatten().to_vec();
                KMeans::<f64, 8, _>::new(v.as_slice(), shape[0], shape[1], EuclideanDistance)
            };
            // Defaults: k-means++ initialization and Lloyd's algorithm.
            let init = init.unwrap_or(PyKMeansInit::PlusPlus());
            let algorithm = algorithm.unwrap_or(PyKMeansAlgorithm::Lloyd());
            let config = KMeansConfig::default();
            match algorithm {
                PyKMeansAlgorithm::Lloyd() => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config)
                        }
                        PyKMeansInit::RandomPartition() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config)
                        }
                        PyKMeansInit::RandomSample() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config)
                        }
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd(
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
                PyKMeansAlgorithm::MiniBatch(size) => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_kmeanplusplus,
                            &config,
                        ),
                        PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_partition,
                            &config,
                        ),
                        PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_sample,
                            &config,
                        ),
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
            }
        })
    }
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn init_plusplus() -> PyKMeansInit {
        PyKMeansInit::PlusPlus()
    }
    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn init_random_partition() -> PyKMeansInit {
        PyKMeansInit::RandomPartition()
    }
    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn init_random_sample() -> PyKMeansInit {
        PyKMeansInit::RandomSample()
    }
    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn init_precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyKMeansInit {
        PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec())
    }
    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::Lloyd()
    }
    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::MiniBatch(batch_size)
    }
    /// find the closest cluster and the distance for each point
    pub(crate) fn predict(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyResult<(Vec<usize>, Vec<f64>)> {
        let points = points.as_array();
        // Release the GIL: the search over centroids is pure Rust (rayon inside).
        Ok(py.detach(|| self.inner.predict(points))?)
    }
    /// calculate the mean simple (using centroids) silhouette score for a set of points,
    /// assignments must be specified if they do not correspond to the assignments in the KMeans instance
    #[pyo3(signature = (points, assignments = None))]
    pub(crate) fn silhouette_simple(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        assignments: Option<PyArrayLike1<usize, AllowTypeChange>>,
    ) -> PyResult<f64> {
        let points = points.as_array();
        let assignments = assignments.as_ref().map(|a| a.as_array());
        Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
    }
    /// number of dimensions
    #[getter]
    pub(crate) fn ndim(&self) -> usize {
        self.ndim
    }
    /// number of clusters
    #[getter]
    pub(crate) fn k(&self) -> usize {
        self.inner.k
    }
    /// sum of all distances, cost measure
    #[getter]
    pub(crate) fn distance_sum(&self) -> f64 {
        self.inner.distsum
    }
    /// centroid coordinates
    #[getter]
    pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
        // Reshape the flat centroid buffer into a (k, ndim) numpy array.
        let v = self.inner.centroids.to_vec();
        Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v)
            .map_err(|e| PyErr::new::<PyTypeError, String>(e.to_string()))?
            .into_pyarray(py))
    }
    /// centroid frequencies
    #[getter]
    pub(crate) fn centroid_frequency(&self) -> Vec<usize> {
        self.inner.centroid_frequency.clone()
    }
    /// to which cluster each of the points is assigned
    #[getter]
    pub(crate) fn assignments(&self) -> Vec<usize> {
        self.inner.assignments.clone()
    }
    /// distances of all points to the center it's assigned to
    #[getter]
    pub(crate) fn centroid_distances(&self) -> Vec<f64> {
        self.inner.centroid_distances.clone()
    }
}
impl Predict<f64> for KMeansState<f64> {
    type Error = Error;

    /// Assign each row of `points` to the nearest centroid.
    /// Returns `(assignments, distances)`, one entry per row.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<f64>), Self::Error>
    where
        A: AsArray<'a, f64, Ix2>,
    {
        let centroids = self.centroids.to_vec();
        // Check for an empty model BEFORE dividing by k below: previously an
        // empty centroid list reported ShapeMismatch (and k == 0 panicked).
        if centroids.is_empty() || self.k == 0 {
            return Err(Error::NoCentroidsDefined);
        }
        let ndim = centroids.len() / self.k;
        let points = points.into();
        let shape = points.shape();
        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        // Pad both operands with zeros up to the next multiple of the SIMD lane
        // width (8); zeros do not change the euclidean distance. The `% 8`
        // avoids appending a full extra lane when ndim is already a multiple of 8.
        let fill = vec![0.0; (8 - ndim % 8) % 8];
        let e = EuclideanDistance;
        let dist = |s: &[f64]| {
            s.par_chunks_exact(ndim)
                .map(|point| {
                    // Pad the query point once, not once per centroid.
                    let padded_point = [point, &fill].concat();
                    let (i, d) = centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .fold(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, centroid)| {
                                let b = <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &padded_point,
                                    &[centroid, &fill].concat(),
                                );
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        )
                        .reduce(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, b)| {
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        );
                    // sqrt matches the original behavior (the kernel value is
                    // treated as a squared distance throughout this file)
                    (i, d.sqrt())
                })
                .collect::<(Vec<_>, Vec<_>)>()
        };
        Ok(if let Some(s) = points.as_slice() {
            dist(s)
        } else {
            // Non-contiguous input: copy into a contiguous buffer first.
            let s = points.flatten().to_vec();
            dist(&s)
        })
    }

    /// Mean "simple" silhouette score: `a` is the distance to the assigned
    /// centroid, `b` the distance to the nearest other centroid, and each
    /// point scores `(b - a) / max(a, b)`; the mean over all points is returned.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>,
    {
        let centroids = Arc::new(self.centroids.to_vec());
        // Reject an empty model before using k as a divisor.
        if centroids.is_empty() || self.k == 0 {
            return Err(Error::NoCentroidsDefined);
        }
        let ndim = centroids.len() / self.k;
        let points = points.into();
        let shape = points.shape();
        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        if shape[0] == 0 {
            // No points: return 0 instead of dividing by zero at the end.
            return Ok(0.0);
        }
        // Use the stored assignments unless the caller provides their own.
        let assignments = if let Some(assignments) = assignments {
            assignments.into().to_vec()
        } else {
            self.assignments.to_vec()
        };
        let k = self.k;
        // Bucket the flattened coordinates of each point into its cluster.
        let mut clusters = vec![Vec::new(); k];
        for (point, assignment) in points.rows().into_iter().zip(assignments) {
            clusters[assignment].extend(point.to_vec());
        }
        // Zero-padding up to the SIMD lane width (see `predict`).
        let fill = vec![0.0; (8 - ndim % 8) % 8];
        // a: distance from each point to its own centroid.
        let a = clusters
            .par_iter()
            .zip(centroids.clone().par_chunks_exact(ndim))
            .flat_map(|(points, centroid)| {
                let c = [centroid, &fill].concat();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                        &e,
                        &c,
                        &[point, &fill].concat(),
                    )
                    .sqrt()
                })
            })
            .collect::<Vec<_>>();
        // b: distance from each point to the nearest *other* centroid.
        let b = clusters
            .par_iter()
            .enumerate()
            .flat_map(|(i, points)| {
                let centroids = centroids.clone();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    // Pad the point once, not once per centroid.
                    let point = [point, &fill].concat();
                    centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .map(|(j, centroid)| {
                            if i == j {
                                // The own centroid never counts as a neighbour.
                                f64::INFINITY
                            } else {
                                <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &[centroid, &fill].concat(),
                                    &point,
                                )
                                .sqrt()
                            }
                        })
                        .min_by(|a, b| a.total_cmp(b))
                        .unwrap_or(f64::INFINITY)
                })
            })
            .collect::<Vec<_>>();
        Ok(a.into_iter()
            .zip(b)
            .map(|(a, b)| {
                let s = (b - a) / a.max(b);
                // k == 1 leaves b = inf and s = inf/inf = NaN; score that as 0
                // instead of poisoning the mean.
                if s.is_nan() { 0.0 } else { s }
            })
            .sum::<f64>()
            / shape[0] as f64)
    }
}
/// Mean full (pairwise) silhouette score over all points.
///
/// For each point: `a` = mean distance to the other members of its own
/// cluster, `b` = smallest mean distance to the members of any other cluster,
/// and the per-point score is `(b - a) / max(a, b)`; the mean is returned.
/// Labels may be any hashable type; they are remapped to dense indices.
fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result<f64, Error>
where
    P: AsArray<'p, f64, Ix2>,
    A: AsArray<'a, K, Ix1>,
    K: 'a + Eq + Hash,
{
    let points = points.into();
    let assignments = assignments.into();
    let shape = points.shape();
    let n = shape[0];
    let ndim = shape[1];
    if n == 0 {
        // No points: return 0 instead of dividing by zero at the end.
        return Ok(0.0);
    }
    // Map arbitrary (hashable) labels onto dense indices 0..k.
    let labels = assignments
        .iter()
        .collect::<HashSet<_>>()
        .into_iter()
        .enumerate()
        .map(|(k, v)| (v, k))
        .collect::<HashMap<_, _>>();
    let assignments = assignments.iter().map(|k| labels[k]).collect::<Vec<_>>();
    let k = labels.len();
    // Bucket the flattened coordinates of each point into its cluster. Every
    // dense label occurs at least once, so no cluster ends up empty.
    let mut clusters = vec![Vec::new(); k];
    for (point, assignment) in points.rows().into_iter().zip(assignments) {
        clusters[assignment].extend(point.to_vec());
    }
    // Total increments: 1 per point in the a-phase plus k per point in the
    // b-phase (the previous length k*n + k*n*k overcounted by a factor of k).
    let bar = get_bar(Some(n + n * k))?;
    // Zero-pad to the SIMD lane width (8); zeros do not affect the distance.
    // `% 8` avoids a full extra lane when ndim is already a multiple of 8.
    let fill = vec![0.0; (8 - ndim % 8) % 8];
    let e = EuclideanDistance;
    // a[i]: mean distance from point i to the other members of its cluster.
    let a = clusters
        .par_iter()
        .flat_map(|points| {
            // Divide by (cluster size - 1): the point's zero distance to
            // itself is included in the sum. A singleton cluster gives
            // 0.0 / 0.0 = NaN, which is mapped to 0 in the final combine.
            let c = (points.len() / ndim - 1) as f64;
            points
                .par_chunks_exact(ndim)
                .map(|i| {
                    // Pad the point once, not once per comparison.
                    let i = [i, &fill].concat();
                    let q = points
                        .par_chunks_exact(ndim)
                        .map(|j| {
                            <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                &e,
                                &i,
                                &[j, &fill].concat(),
                            )
                            .sqrt()
                        })
                        .sum::<f64>()
                        / c;
                    bar.inc(1);
                    q
                })
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();
    // b[i]: smallest mean distance from point i to another cluster's members.
    let b = clusters
        .par_iter()
        .enumerate()
        .flat_map(|(i, points_i)| {
            points_i
                .par_chunks_exact(ndim)
                .map(|a| {
                    let a = [a, &fill].concat();
                    clusters
                        .par_iter()
                        .enumerate()
                        .map(|(j, points_j)| {
                            let c = (points_j.len() / ndim) as f64;
                            let q = if i == j {
                                // The own cluster never counts as a neighbour.
                                f64::INFINITY
                            } else {
                                points_j
                                    .par_chunks_exact(ndim)
                                    .map(|b| {
                                        <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                            &e,
                                            &a,
                                            &[b, &fill].concat(),
                                        )
                                        .sqrt()
                                    })
                                    .sum::<f64>()
                                    / c
                            };
                            bar.inc(1);
                            q
                        })
                        .min_by(|a, b| a.total_cmp(b))
                        .unwrap_or(f64::INFINITY)
                })
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();
    bar.finish();
    Ok(a.into_iter()
        .zip(b)
        .map(|(a, b)| {
            let s = (b - a) / a.max(b);
            // Singleton clusters (a = NaN) and the single-cluster case
            // (b = inf, s = inf/inf = NaN) have no defined silhouette;
            // score them as 0 by convention instead of returning NaN.
            if s.is_nan() { 0.0 } else { s }
        })
        .sum::<f64>()
        / n as f64)
}
/// calculate the mean silhouette score for a set of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **assignments**: cluster label for each point (one entry per row of points)
#[gen_stub_pyfunction(module = "kmeans_rs")]
#[pyfunction(name = "silhouette")]
pub(crate) fn py_silhouette(
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    assignments: PyArrayLike1<usize, AllowTypeChange>,
) -> PyResult<f64> {
    let points = points.as_array();
    let assignments = assignments.as_array();
    // Release the GIL: the computation is pure Rust (parallelized with rayon).
    Ok(py.detach(|| silhouette(points, assignments))?)
}
/// generates kmeans/__init__.pyi
#[pyfunction]
fn generate_stub(dest_path: String) -> PyResult<()> {
    // Collect stub information for the "kmeans_rs" module rooted at
    // <dest_path>/py, then write out the .pyi file(s).
    let stubs = StubInfo::from_project_root(
        "kmeans_rs".to_string(),
        PathBuf::from(dest_path).join("py"),
        true,
        StubGenConfig::default(),
    )?;
    stubs.generate()?;
    Ok(())
}
/// The `kmeans_rs` Python extension module: exports the KMeans class, its
/// init/algorithm enums, the silhouette function and the stub generator.
#[pymodule]
#[pyo3(name = "kmeans_rs")]
mod kmeans_rs {
    use pyo3::prelude::*;
    #[pymodule_export]
    use super::generate_stub;
    #[pymodule_export]
    use super::PyKMeans;
    #[pymodule_export]
    use super::PyKMeansInit;
    #[pymodule_export]
    use super::PyKMeansAlgorithm;
    #[pymodule_export]
    use super::py_silhouette;
    /// Module init: install color_eyre so Rust errors render with full context.
    #[pymodule_init]
    fn init(_: &Bound<'_, PyModule>) -> PyResult<()> {
        Ok(color_eyre::install()?)
    }
}