first commit

2026-04-16 16:26:53 +02:00
commit 20e04c8b53
11 changed files with 1529 additions and 0 deletions
@@ -0,0 +1,33 @@
+import os
+import sys
+from importlib.metadata import version
+from pathlib import Path
+
+os.environ["RUST_BACKTRACE"] = "full"  # set before the extension import below
+os.environ["COLORBT_SHOW_HIDDEN"] = "1"  # presumably read by the Rust side; confirm
+
+from .kmeans_rs import *  # noqa
+
+try:  # version of the installed distribution named like this package directory
+    __version__ = version(Path(__file__).parent.name)
+except (Exception,):  # best effort: any failure yields a placeholder
+    __version__ = "unknown"
+
+try:  # best-effort read of the checked-out commit from <package parent>/.git
+    with open(Path(__file__).parent.parent / ".git" / "HEAD") as g:
+        head = g.read().split(":")[1].strip()  # ref path; IndexError if HEAD has no ':'
+    with open(Path(__file__).parent.parent / ".git" / head) as h:
+        __git_commit_hash__ = h.read().rstrip("\n")
+except (Exception,):  # missing files or unexpected HEAD format yield a placeholder
+    __git_commit_hash__ = "unknown"
+
+
+def kmeans_generate_stub():
+    """Run ``generate_stub`` on the root given as ``sys.argv[1]`` (default: the working directory)."""
+    # The root is accepted only if it contains ``py/kmeans_rs/__init__.py``.
+    root = (Path(sys.argv[1]) if len(sys.argv) > 1 else Path.cwd()).resolve()
+    marker = root / "py" / "kmeans_rs" / "__init__.py"
+    if not marker.exists():
+        # The exception message is the path that was expected to exist.
+        raise ModuleNotFoundError(str(marker))
+    generate_stub(str(root))  # noqa
@@ -0,0 +1,245 @@
+# This file is automatically generated by pyo3_stub_gen
+# ruff: noqa: E501, F401, F403, F405
+
+import builtins
+import numpy
+import numpy.typing
+import typing
+__all__ = [  # public names exported by this stub module
+    "KMeans",
+    "KMeansAlgorithm",
+    "KMeansInit",
+    "silhouette",
+]
+
+@typing.final
+class KMeans:
+    r"""
+    Compute kmeans clustering
+    this implementation is supposed to be faster than scipy or scikit-learn
+    when dealing with a lot of points
+    
+    ## Arguments
+    - **points**: Numpy array #points x dimensions
+    - **k**: Amount of clusters to search for
+    - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
+    - **init**: initialization method
+    - **algorithm**: algorithm to use
+    """
+    @property
+    def ndim(self) -> builtins.int:
+        r"""
+        number of dimensions
+        """
+    @property
+    def k(self) -> builtins.int:
+        r"""
+        number of clusters
+        """
+    @property
+    def distance_sum(self) -> builtins.float:
+        r"""
+        sum of all distances, cost measure
+        """
+    @property
+    def centroids(self) -> numpy.typing.NDArray[numpy.float64]:
+        r"""
+        centroid coordinates
+        """
+    @property
+    def centroid_frequency(self) -> builtins.list[builtins.int]:
+        r"""
+        centroid frequencies
+        """
+    @property
+    def assignments(self) -> builtins.list[builtins.int]:
+        r"""
+        to which cluster each of the points is assigned
+        """
+    @property
+    def centroid_distances(self) -> builtins.list[builtins.float]:
+        r"""
+        distances of all points to the center it's assigned to
+        """
+    def __new__(cls, points: numpy.typing.ArrayLike, k: builtins.int, max_iter: builtins.int = 300, init: typing.Optional[KMeansInit] = None, algorithm: typing.Optional[KMeansAlgorithm] = None) -> KMeans: ...
+    @staticmethod
+    def init_plusplus() -> KMeansInit:
+        r"""
+        K-Means++ initialization method, as implemented in Matlab
+        
+        ## Description
+        This initialization method starts by selecting one sample as first centroid.
+        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
+        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
+        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
+        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
+        their currently assigned cluster's centroid.
+        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5    Section: More About)
+        """
+    @staticmethod
+    def init_random_partition() -> KMeansInit:
+        r"""
+        Random-Partition initialization method
+        
+        ## Description
+        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
+        These means are then used as initial clusters.
+        """
+    @staticmethod
+    def init_random_sample() -> KMeansInit:
+        r"""
+        Random sample initialization method (a.k.a. Forgy)
+        
+        ## Description
+        This initialization method randomly selects k centroids from the samples as initial centroids.
+        """
+    @staticmethod
+    def init_precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
+        r"""
+        Precomputed centroids initialization method
+        
+        ## Description
+        This initialization method requires a precomputed list of k centroids to use as initial
+        centroids.
+        """
+    @staticmethod
+    def algo_lloyd() -> KMeansAlgorithm:
+        r"""
+        Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
+        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5    Section: More About)
+        """
+    @staticmethod
+    def algo_mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
+        r"""
+        Mini-Batch k-Means implementation.
+        (see: https://dl.acm.org/citation.cfm?id=1772862)
+        
+        ## Arguments
+        - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
+        """
+    def predict(self, points: numpy.typing.ArrayLike) -> tuple[builtins.list[builtins.int], builtins.list[builtins.float]]:
+        r"""
+        find the closest cluster and the distance for each point
+        """
+    def silhouette_simple(self, points: numpy.typing.ArrayLike, assignments: typing.Optional[numpy.typing.ArrayLike] = None) -> builtins.float:
+        r"""
+        calculate the mean simple (using centroids) silhouette score for a set of points;
+        assignments (default None) must be specified if they do not correspond to the assignments in the KMeans instance
+        """
+
+class KMeansAlgorithm:
+    r"""
+    Specify a kmeans algorithm; build one with the static methods lloyd() or mini_batch().
+    """
+    @staticmethod
+    def lloyd() -> KMeansAlgorithm:
+        r"""
+        Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
+        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5    Section: More About)
+        """
+    @staticmethod
+    def mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
+        r"""
+        Mini-Batch k-Means implementation.
+        (see: https://dl.acm.org/citation.cfm?id=1772862)
+        
+        ## Arguments
+        - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
+        """
+    @typing.final
+    class Lloyd(KMeansAlgorithm):
+        __match_args__ = ()  # no positional fields for match-statement patterns
+        def __new__(cls) -> KMeansAlgorithm.Lloyd: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+    @typing.final
+    class MiniBatch(KMeansAlgorithm):
+        __match_args__ = ("_0",)  # single positional field, exposed as the _0 property
+        @property
+        def _0(self) -> builtins.int: ...  # presumably the batch size; confirm against the Rust enum
+        def __new__(cls, _0: builtins.int) -> KMeansAlgorithm.MiniBatch: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+
+class KMeansInit:
+    r"""
+    Specify an initialization method; build one with the static methods plusplus(), random_partition(), random_sample() or precomputed().
+    """
+    @staticmethod
+    def plusplus() -> KMeansInit:
+        r"""
+        K-Means++ initialization method, as implemented in Matlab
+        
+        ## Description
+        This initialization method starts by selecting one sample as first centroid.
+        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
+        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
+        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
+        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
+        their currently assigned cluster's centroid.
+        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5    Section: More About)
+        """
+    @staticmethod
+    def random_partition() -> KMeansInit:
+        r"""
+        Random-Partition initialization method
+        
+        ## Description
+        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
+        These means are then used as initial clusters.
+        """
+    @staticmethod
+    def random_sample() -> KMeansInit:
+        r"""
+        Random sample initialization method (a.k.a. Forgy)
+        
+        ## Description
+        This initialization method randomly selects k centroids from the samples as initial centroids.
+        """
+    @staticmethod
+    def precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
+        r"""
+        Precomputed centroids initialization method
+        
+        ## Description
+        This initialization method requires a precomputed list of k centroids to use as initial
+        centroids.
+        """
+    @typing.final
+    class PlusPlus(KMeansInit):
+        __match_args__ = ()  # no positional fields for match-statement patterns
+        def __new__(cls) -> KMeansInit.PlusPlus: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+    @typing.final
+    class RandomPartition(KMeansInit):
+        __match_args__ = ()  # no positional fields for match-statement patterns
+        def __new__(cls) -> KMeansInit.RandomPartition: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+    @typing.final
+    class RandomSample(KMeansInit):
+        __match_args__ = ()  # no positional fields for match-statement patterns
+        def __new__(cls) -> KMeansInit.RandomSample: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+    @typing.final
+    class Precomputed(KMeansInit):
+        __match_args__ = ("_0",)  # single positional field, exposed as the _0 property
+        @property
+        def _0(self) -> builtins.list[builtins.float]: ...  # presumably the centroid values; confirm layout
+        def __new__(cls, _0: typing.Sequence[builtins.float]) -> KMeansInit.Precomputed: ...
+        def __len__(self) -> builtins.int: ...
+        def __getitem__(self, key: builtins.int) -> typing.Any: ...
+    
+
+def silhouette(points: numpy.typing.ArrayLike, assignments: numpy.typing.ArrayLike) -> builtins.float:
+    r"""
+    Calculate the mean silhouette score for a set of points and their cluster assignments.
+    """
+