20e04c8b53
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
697 lines
24 KiB
Rust
697 lines
24 KiB
Rust
use console::Term;
|
|
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
|
|
use kmeans::*;
|
|
use ndarray::{Array2, AsArray, Ix1, Ix2};
|
|
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
|
|
use pyo3::exceptions::PyTypeError;
|
|
use pyo3::prelude::*;
|
|
use pyo3_stub_gen::derive::*;
|
|
use pyo3_stub_gen::{StubGenConfig, StubInfo};
|
|
use rayon::prelude::*;
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::hash::Hash;
|
|
use std::path::PathBuf;
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
/// Errors produced by the Rust-side k-means helpers in this module.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// An invalid indicatif progress-bar template string (surfaced from `get_bar`).
    #[error(transparent)]
    ProgressBarTemplate(#[from] indicatif::style::TemplateError),
    /// Point dimensionality does not match centroid dimensionality (got, expected).
    #[error("shape mismatch: {0} != {1}")]
    ShapeMismatch(usize, usize),
    /// The model holds an empty centroid list, so prediction/scoring cannot proceed.
    #[error("no centroids defined")]
    NoCentroidsDefined,
}
|
|
|
|
impl From<Error> for PyErr {
|
|
fn from(err: Error) -> PyErr {
|
|
color_eyre::eyre::Report::from(err).into()
|
|
}
|
|
}
|
|
|
|
/// a progress bar with an ok style that when py::detach is used also works in jupyter
|
|
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
|
|
let style = ProgressStyle::with_template(
|
|
"{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
|
|
)?.progress_chars("#>-");
|
|
let bar = ProgressBar::with_draw_target(
|
|
count.map(|i| i as u64),
|
|
ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20),
|
|
)
|
|
.with_style(style);
|
|
bar.enable_steady_tick(Duration::from_millis(100));
|
|
Ok(bar)
|
|
}
|
|
|
|
/// Prediction/scoring interface; implemented for `KMeansState<f64>` below.
trait Predict<T> {
    type Error;

    /// For each row of `points`, return the index of the nearest centroid and
    /// the distance to it, as two parallel vectors.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
    where
        A: AsArray<'a, T, Ix2>,
        T: 'a;

    /// Mean "simple" silhouette score of `points`, computed against cluster
    /// centroids rather than all pairwise point distances. When `assignments`
    /// is `None`, the assignments stored in the model are used.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>;
}
|
|
|
|
/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansInit {
    /// K-Means++ seeding (Matlab-style); see `plusplus()`.
    PlusPlus(),
    /// Random partitioning of samples; partition means become the initial centroids.
    RandomPartition(),
    /// Random sample of k points used directly as initial centroids (a.k.a. Forgy).
    RandomSample(),
    /// User-supplied centroids, stored flattened (row-major, k * ndim values).
    Precomputed(Vec<f64>),
}
|
|
|
|
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansInit {
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn plusplus() -> Self {
        Self::PlusPlus()
    }

    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn random_partition() -> Self {
        Self::RandomPartition()
    }

    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn random_sample() -> Self {
        Self::RandomSample()
    }

    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> Self {
        // The 2-D numpy array is flattened to a row-major Vec<f64> for storage.
        Self::Precomputed(centroids.as_array().flatten().to_vec())
    }
}
|
|
|
|
/// Specify a kmeans algorithm using lloyd or mini_batch.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansAlgorithm {
    /// Classic full-batch Lloyd iteration.
    Lloyd(),
    /// Mini-batch k-means; the payload is the batch size per iteration.
    MiniBatch(usize),
}
|
|
|
|
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansAlgorithm {
    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn lloyd() -> Self {
        Self::Lloyd()
    }

    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn mini_batch(batch_size: usize) -> Self {
        Self::MiniBatch(batch_size)
    }
}
|
|
|
|
/// Compute kmeans clustering
/// this implementation is supposed to be faster than scipy or scikit-learn
/// when dealing with a lot of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **k**: Amount of clusters to search for
/// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
/// - **init**: initialization method
/// - **algorithm**: algorithm to use
#[gen_stub_pyclass]
#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) struct PyKMeans {
    // Dimensionality of the fitted points, kept separately because the
    // KMeansState stores centroids as a flat buffer.
    ndim: usize,
    // Fitted state from the `kmeans` crate (centroids, assignments, distsum, ...).
    inner: KMeansState<f64>,
}
|
|
|
|
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeans {
    /// Fit a k-means model on `points` (rows = samples, columns = dimensions).
    /// Defaults: k-means++ initialization and Lloyd's algorithm.
    #[new]
    #[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))]
    pub(crate) fn new(
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        k: usize,
        max_iter: usize,
        init: Option<PyKMeansInit>,
        algorithm: Option<PyKMeansAlgorithm>,
    ) -> Self {
        let points = points.as_array();
        // Release the GIL for the whole fit.
        py.detach(|| {
            let shape = points.shape();
            // KMeans::new needs a contiguous &[f64]; copy only when the numpy
            // array is not already contiguous.
            let kmeans = if let Some(s) = points.as_slice() {
                KMeans::<f64, 8, _>::new(s, shape[0], shape[1], EuclideanDistance)
            } else {
                let v = points.flatten().to_vec();
                KMeans::<f64, 8, _>::new(v.as_slice(), shape[0], shape[1], EuclideanDistance)
            };
            // Fill in the documented defaults when the caller passed None.
            let init = if let Some(init) = init {
                init
            } else {
                PyKMeansInit::PlusPlus()
            };
            let algorithm = if let Some(algorithm) = algorithm {
                algorithm
            } else {
                PyKMeansAlgorithm::Lloyd()
            };
            let config = KMeansConfig::default();
            // Dispatch on (algorithm, init); each arm runs the fit to completion
            // and stores the resulting KMeansState.
            match algorithm {
                PyKMeansAlgorithm::Lloyd() => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config)
                        }
                        PyKMeansInit::RandomPartition() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config)
                        }
                        PyKMeansInit::RandomSample() => {
                            kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config)
                        }
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd(
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
                PyKMeansAlgorithm::MiniBatch(size) => PyKMeans {
                    ndim: shape[1],
                    inner: match init {
                        PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_kmeanplusplus,
                            &config,
                        ),
                        PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_partition,
                            &config,
                        ),
                        PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_random_sample,
                            &config,
                        ),
                        PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch(
                            size,
                            k,
                            max_iter,
                            KMeans::init_precomputed(centroids),
                            &config,
                        ),
                    },
                },
            }
        })
    }

    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn init_plusplus() -> PyKMeansInit {
        PyKMeansInit::PlusPlus()
    }

    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn init_random_partition() -> PyKMeansInit {
        PyKMeansInit::RandomPartition()
    }

    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn init_random_sample() -> PyKMeansInit {
        PyKMeansInit::RandomSample()
    }

    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn init_precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyKMeansInit {
        // Flattened row-major copy, same as PyKMeansInit::precomputed.
        PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec())
    }

    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::Lloyd()
    }

    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm {
        PyKMeansAlgorithm::MiniBatch(batch_size)
    }

    /// find the closest cluster and the distance for each point
    pub(crate) fn predict(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
    ) -> PyResult<(Vec<usize>, Vec<f64>)> {
        let points = points.as_array();
        // GIL released while the (rayon-parallel) distance computation runs.
        Ok(py.detach(|| self.inner.predict(points))?)
    }

    /// calculate the mean simple (using centroids) silhouette score for a set of points,
    /// assignments must be specified if they do not correspond to the assignments in the KMeans instance
    #[pyo3(signature = (points, assignments = None))]
    pub(crate) fn silhouette_simple(
        &self,
        py: Python,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        points: PyArrayLike2<f64, AllowTypeChange>,
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        assignments: Option<PyArrayLike1<usize, AllowTypeChange>>,
    ) -> PyResult<f64> {
        let points = points.as_array();
        // Borrow the optional assignment array as a view before detaching.
        let assignments = assignments.as_ref().map(|a| a.as_array());
        Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
    }

    /// number of dimensions
    #[getter]
    pub(crate) fn ndim(&self) -> usize {
        self.ndim
    }

    /// number of clusters
    #[getter]
    pub(crate) fn k(&self) -> usize {
        self.inner.k
    }

    /// sum of all distances, cost measure
    #[getter]
    pub(crate) fn distance_sum(&self) -> f64 {
        self.inner.distsum
    }

    /// centroid coordinates
    #[getter]
    pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
        // Reshape the flat centroid buffer back into a (k, ndim) numpy array.
        let v = self.inner.centroids.to_vec();
        Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v)
            .map_err(|e| PyErr::new::<PyTypeError, String>(e.to_string()))?
            .into_pyarray(py))
    }

    /// centroid frequencies
    #[getter]
    pub(crate) fn centroid_frequency(&self) -> Vec<usize> {
        self.inner.centroid_frequency.clone()
    }

    /// to which cluster each of the points is assigned
    #[getter]
    pub(crate) fn assignments(&self) -> Vec<usize> {
        self.inner.assignments.clone()
    }

    /// distances of all points to the center it's assigned to
    #[getter]
    pub(crate) fn centroid_distances(&self) -> Vec<f64> {
        self.inner.centroid_distances.clone()
    }
}
|
|
|
|
impl Predict<f64> for KMeansState<f64> {
    type Error = Error;

    /// Nearest-centroid classification: for each input row, find the centroid
    /// with the smallest Euclidean distance and return (index, distance) pairs
    /// as two parallel vectors.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<f64>), Self::Error>
    where
        A: AsArray<'a, f64, Ix2>,
    {
        let centroids = self.centroids.to_vec();
        // Dimensionality is recovered from the flat centroid buffer.
        let ndim = centroids.len() / self.k;
        let points = points.into();
        let shape = points.shape();
        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        if centroids.is_empty() {
            return Err(Error::NoCentroidsDefined);
        }
        // Zero padding so that slices passed to the 8-lane distance function
        // have a lane-friendly length. NOTE(review): when ndim % 8 == 0 this
        // pads by a full extra 8 zeros; harmless for Euclidean distance but
        // worth confirming against the kmeans crate's expectations.
        let fill = vec![0.0; 8 - ndim % 8];
        let e = EuclideanDistance;
        let dist = |s: &[f64]| {
            // One parallel task per point; each task scans all centroids with a
            // parallel fold/reduce keeping the (index, squared-distance) minimum.
            s.par_chunks_exact(ndim)
                .map(|point| {
                    let (i, d) = centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .fold(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, centroid)| {
                                let b = <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &[point, &fill].concat(),
                                    &[centroid, &fill].concat(),
                                );
                                // `<=` keeps the earlier index on ties.
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        )
                        .reduce(
                            || (usize::MAX, f64::INFINITY),
                            |(i, a), (j, b)| {
                                if a <= b { (i, a) } else { (j, b) }
                            },
                        );
                    // sqrt here: the distance function appears to return the
                    // squared distance — TODO confirm against the kmeans crate.
                    (i, d.sqrt())
                })
                .collect::<(Vec<_>, Vec<_>)>()
        };

        // Use the backing slice directly when contiguous; otherwise copy once.
        Ok(if let Some(s) = points.as_slice() {
            dist(s)
        } else {
            let s = points.flatten().to_vec();
            dist(&s)
        })
    }

    /// Centroid-based ("simple") silhouette: a = distance of each point to its
    /// own centroid, b = distance to the nearest other centroid; the score is
    /// mean((b - a) / max(a, b)) over all points.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>,
    {
        let points = points.into();
        let shape = points.shape();
        // Arc so the centroid buffer can be shared into the rayon closures.
        let centroids = Arc::new(self.centroids.to_vec());
        let ndim = centroids.len() / self.k;

        if shape[1] != ndim {
            return Err(Error::ShapeMismatch(shape[1], ndim));
        }
        if centroids.is_empty() {
            return Err(Error::NoCentroidsDefined);
        }

        // Caller-provided assignments override the stored ones.
        let assignments = if let Some(assignments) = assignments {
            assignments.into().to_vec()
        } else {
            self.assignments.to_vec()
        };
        let k = self.k;
        // Bucket the point coordinates by cluster (flat, row-major per bucket).
        // NOTE(review): an assignment value >= k would panic on this index;
        // inputs are assumed to be valid cluster ids — confirm at the caller.
        let mut clusters = vec![Vec::new(); k];
        for (point, assignment) in points.rows().into_iter().zip(assignments) {
            clusters[assignment].extend(point.to_vec());
        }
        // Same 8-lane zero padding as in `predict`.
        let fill = vec![0.0; 8 - ndim % 8];
        // a[i]: distance of each point to its own cluster's centroid. Both the
        // `a` and `b` passes flatten clusters in the same order, so the two
        // vectors stay aligned point-for-point.
        let a = clusters
            .par_iter()
            .zip(centroids.clone().par_chunks_exact(ndim))
            .flat_map(|(points, centroid)| {
                let c = [centroid, &fill].concat();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                        &e,
                        &c,
                        &[point, &fill].concat(),
                    )
                    .sqrt()
                })
            })
            .collect::<Vec<_>>();

        // b[i]: distance of each point to the nearest *other* centroid
        // (its own cluster is masked out with INFINITY).
        let b = clusters
            .par_iter()
            .enumerate()
            .flat_map(|(i, points)| {
                let centroids = centroids.clone();
                let fill = fill.clone();
                let e = EuclideanDistance;
                points.par_chunks_exact(ndim).map(move |point| {
                    centroids
                        .par_chunks_exact(ndim)
                        .enumerate()
                        .map(|(j, centroid)| {
                            if i == j {
                                f64::INFINITY
                            } else {
                                <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
                                    &e,
                                    &[centroid, &fill].concat(),
                                    &[point, &fill].concat(),
                                )
                                .sqrt()
                            }
                        })
                        .min_by(|a, b| a.total_cmp(b))
                        .unwrap_or(f64::INFINITY)
                })
            })
            .collect::<Vec<_>>();
        // Mean silhouette over all points.
        Ok(a.into_iter()
            .zip(b)
            .map(|(a, b)| (b - a) / a.max(b))
            .sum::<f64>()
            / points.shape()[0] as f64)
    }
}
|
|
|
|
fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result<f64, Error>
|
|
where
|
|
P: AsArray<'p, f64, Ix2>,
|
|
A: AsArray<'a, K, Ix1>,
|
|
K: 'a + Eq + Hash,
|
|
{
|
|
let points = points.into();
|
|
let assignments = assignments.into();
|
|
let shape = points.shape();
|
|
let n = shape[0];
|
|
let ndim = shape[1];
|
|
|
|
let labels = assignments
|
|
.iter()
|
|
.collect::<HashSet<_>>()
|
|
.into_iter()
|
|
.enumerate()
|
|
.map(|(k, v)| (v, k))
|
|
.collect::<HashMap<_, _>>();
|
|
let assignments = assignments.iter().map(|k| labels[k]).collect::<Vec<_>>();
|
|
let k = labels.len();
|
|
|
|
let mut clusters = vec![Vec::new(); k];
|
|
for (point, assignment) in points.rows().into_iter().zip(assignments) {
|
|
clusters[assignment].extend(point.to_vec());
|
|
}
|
|
let bar = get_bar(Some(k * n + k * n * k))?;
|
|
let fill = vec![0.0; 8 - ndim % 8];
|
|
let e = EuclideanDistance;
|
|
let a = clusters
|
|
.par_iter()
|
|
.flat_map(|points| {
|
|
let c = (points.len() / ndim - 1) as f64;
|
|
points
|
|
.par_chunks_exact(ndim)
|
|
.map(|i| {
|
|
let q = points
|
|
.par_chunks_exact(ndim)
|
|
.map(|j| {
|
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
|
&e,
|
|
&[i, &fill].concat(),
|
|
&[j, &fill].concat(),
|
|
)
|
|
.sqrt()
|
|
})
|
|
.sum::<f64>()
|
|
/ c;
|
|
bar.inc(1);
|
|
q
|
|
})
|
|
.collect::<Vec<_>>()
|
|
})
|
|
.collect::<Vec<_>>();
|
|
|
|
let b = clusters
|
|
.par_iter()
|
|
.enumerate()
|
|
.flat_map(|(i, points_i)| {
|
|
points_i
|
|
.par_chunks_exact(ndim)
|
|
.map(|a| {
|
|
clusters
|
|
.par_iter()
|
|
.enumerate()
|
|
.map(|(j, points_j)| {
|
|
let c = (points_j.len() / ndim) as f64;
|
|
let q = if i == j {
|
|
f64::INFINITY
|
|
} else {
|
|
points_j
|
|
.par_chunks_exact(ndim)
|
|
.map(|b| {
|
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
|
&e,
|
|
&[a, &fill].concat(),
|
|
&[b, &fill].concat(),
|
|
)
|
|
.sqrt()
|
|
})
|
|
.sum::<f64>()
|
|
/ c
|
|
};
|
|
bar.inc(1);
|
|
q
|
|
})
|
|
.min_by(|a, b| a.total_cmp(b))
|
|
.unwrap_or(f64::INFINITY)
|
|
})
|
|
.collect::<Vec<_>>()
|
|
})
|
|
.collect::<Vec<_>>();
|
|
bar.finish();
|
|
Ok(a.into_iter()
|
|
.zip(b)
|
|
.map(|(a, b)| (b - a) / a.max(b))
|
|
.sum::<f64>()
|
|
/ points.shape()[0] as f64)
|
|
}
|
|
|
|
/// calculate the mean silhouette score for a set of points
#[gen_stub_pyfunction(module = "kmeans_rs")]
#[pyfunction(name = "silhouette")]
pub(crate) fn py_silhouette(
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    assignments: PyArrayLike1<usize, AllowTypeChange>,
) -> PyResult<f64> {
    // Borrow both arrays as views, then release the GIL for the O(n^2) work.
    let points = points.as_array();
    let assignments = assignments.as_array();
    Ok(py.detach(|| silhouette(points, assignments))?)
}
|
|
|
|
/// generates kmeans/__init__.pyi
|
|
#[pyfunction]
|
|
fn generate_stub(dest_path: String) -> PyResult<()> {
|
|
Ok(StubInfo::from_project_root(
|
|
"kmeans_rs".to_string(),
|
|
PathBuf::from(dest_path).join("py"),
|
|
true,
|
|
StubGenConfig::default(),
|
|
)?
|
|
.generate()?)
|
|
}
|
|
|
|
/// Python module definition: re-exports the classes and functions above under
/// the `kmeans_rs` name via pyo3's declarative module syntax.
#[pymodule]
#[pyo3(name = "kmeans_rs")]
mod kmeans_rs {
    use pyo3::prelude::*;

    #[pymodule_export]
    use super::generate_stub;

    #[pymodule_export]
    use super::PyKMeans;

    #[pymodule_export]
    use super::PyKMeansInit;

    #[pymodule_export]
    use super::PyKMeansAlgorithm;

    #[pymodule_export]
    use super::py_silhouette;

    /// Runs once at import time: installs color-eyre so Rust errors reach
    /// Python with formatted reports.
    #[pymodule_init]
    fn init(_: &Bound<'_, PyModule>) -> PyResult<()> {
        Ok(color_eyre::install()?)
    }
}
|