From 20e04c8b537ff1d3c3937fecce7e21b321128dfe Mon Sep 17 00:00:00 2001 From: "w.pomp" Date: Thu, 16 Apr 2026 16:26:53 +0200 Subject: [PATCH] first commit --- .github/workflows/CI.yml | 181 ++++++++++ .gitignore | 74 ++++ Cargo.toml | 30 ++ LICENSE-APACHE | 201 +++++++++++ LICENSE-MIT | 27 ++ README.md | 5 + py/kmeans_rs/__init__.py | 33 ++ py/kmeans_rs/__init__.pyi | 245 ++++++++++++++ pyproject.toml | 35 ++ rust-toolchain.toml | 2 + src/lib.rs | 696 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 1529 insertions(+) create mode 100644 .github/workflows/CI.yml create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 py/kmeans_rs/__init__.py create mode 100644 py/kmeans_rs/__init__.pyi create mode 100644 pyproject.toml create mode 100644 rust-toolchain.toml create mode 100644 src/lib.rs diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..c8fe915 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,181 @@ +# This file is autogenerated by maturin v1.8.4 +# To update, run +# +# maturin generate-ci github +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + - runner: ubuntu-22.04 + target: s390x + - runner: ubuntu-22.04 + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} + 
manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: ${{ !startsWith(github.ref, 'refs/tags/') }} + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-13 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: ${{ !startsWith(github.ref, 'refs/tags/') 
}} + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v2 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: ${{ startsWith(github.ref, 'refs/tags/') }} + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..11af59c --- /dev/null +++ b/.gitignore @@ -0,0 +1,74 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx 
documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version + +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..208633b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kmeans_rs" +version = "0.1.0" +edition = "2024" +rust-version = "1.94.0" +authors = ["Wim Pomp "] +license = "MIT OR Apache-2.0" +description = "Python wrapper for Rust kmeans library." +homepage = "https://git.wimpomp.nl/wim/kmeans_rs" +repository = "https://git.wimpomp.nl/wim/kmeans_rs" +readme = "README.md" +keywords = ["kmeans"] +categories = ["science"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "kmeans_rs" +crate-type = ["cdylib"] + +[dependencies] +color-eyre = "0.6" +console = "0.16" +indicatif = { version = "0.18", features = ["rayon"] } +kmeans = "2" +ndarray = "0.17" +numpy = "0.28" +pyo3 = { version = "0.28", features = ["abi3-py310", "anyhow", "eyre"] } +pyo3-stub-gen = "0.22" +rayon = "1" +thiserror = "2" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..f8e5e5e --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..5a3aa7f --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,27 @@ +Copyright (c) 2015 - 2021 Ulrik Sverdrup "bluss", + Jim Turner, + and ndarray developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..20e5044 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +### KMeans +A small library wrapping the unaffiliated Rust kmeans library: https://crates.io/crates/kmeans, +kmeans is fast for big datasets due because of the use of multicore processing and SIMD. + +Building requires rust nightly. diff --git a/py/kmeans_rs/__init__.py b/py/kmeans_rs/__init__.py new file mode 100644 index 0000000..ecd66ff --- /dev/null +++ b/py/kmeans_rs/__init__.py @@ -0,0 +1,33 @@ +import os +import sys +from importlib.metadata import version +from pathlib import Path + +os.environ["RUST_BACKTRACE"] = "full" +os.environ["COLORBT_SHOW_HIDDEN"] = "1" + +from .kmeans_rs import * # noqa + +try: + __version__ = version(Path(__file__).parent.name) +except (Exception,): + __version__ = "unknown" + +try: + with open(Path(__file__).parent.parent / ".git" / "HEAD") as g: + head = g.read().split(":")[1].strip() + with open(Path(__file__).parent.parent / ".git" / head) as h: + __git_commit_hash__ = h.read().rstrip("\n") +except (Exception,): + __git_commit_hash__ = "unknown" + + +def kmeans_generate_stub(): + if len(sys.argv) > 1: + path = Path(sys.argv[1]).resolve() + else: + path = Path.cwd().resolve() + if (path / "py" / "kmeans_rs" / "__init__.py").exists(): + generate_stub(str(path)) # noqa + else: + raise ModuleNotFoundError(str(path / "py" / "kmeans_rs" / "__init__.py")) diff --git a/py/kmeans_rs/__init__.pyi b/py/kmeans_rs/__init__.pyi new file mode 100644 index 0000000..96f48b4 --- /dev/null +++ b/py/kmeans_rs/__init__.pyi @@ -0,0 +1,245 @@ +# This file is automatically generated by pyo3_stub_gen +# ruff: noqa: E501, F401, F403, F405 + +import builtins +import numpy +import numpy.typing +import typing +__all__ = [ + "KMeans", + "KMeansAlgorithm", + "KMeansInit", + "silhouette", +] + +@typing.final +class KMeans: + r""" + Compute kmeans clustering + this implementation is supposed to be faster than 
scipy or scikit-learn + when dealing with a lot of points + + ## Arguments + - **points**: Numpy array #points x dimensions + - **k**: Amount of clusters to search for + - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite) + - **init**: initialization method + - **algorithm**: algorithm to use + """ + @property + def ndim(self) -> builtins.int: + r""" + number of dimensions + """ + @property + def k(self) -> builtins.int: + r""" + number of clusters + """ + @property + def distance_sum(self) -> builtins.float: + r""" + sum of all distances, cost measure + """ + @property + def centroids(self) -> numpy.typing.NDArray[numpy.float64]: + r""" + centroid coordinates + """ + @property + def centroid_frequency(self) -> builtins.list[builtins.int]: + r""" + centroid frequencies + """ + @property + def assignments(self) -> builtins.list[builtins.int]: + r""" + to which cluster each of the points is assigned + """ + @property + def centroid_distances(self) -> builtins.list[builtins.float]: + r""" + distances of all points to the center it's assigned to + """ + def __new__(cls, points: numpy.typing.ArrayLike, k: builtins.int, max_iter: builtins.int = 300, init: typing.Optional[KMeansInit] = None, algorithm: typing.Optional[KMeansAlgorithm] = None) -> KMeans: ... + @staticmethod + def init_plusplus() -> KMeansInit: + r""" + K-Means++ initialization method, as implemented in Matlab + + ## Description + This initialization method starts by selecting one sample as first centroid. + Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating + each sample's probability of "being a centroid". This probability is bigger, the farther away a sample + is from its centroid. Then, one sample is randomly selected, while taking their probability of being + the next centroid into account. This leads to a tendency of selecting centroids, that are far away from + their currently assigned cluster's centroid. 
+ (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + """ + @staticmethod + def init_random_partition() -> KMeansInit: + r""" + Random-Partition initialization method + + ## Description + This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means. + These means are then used as initial clusters. + """ + @staticmethod + def init_random_sample() -> KMeansInit: + r""" + Random sample initialization method (a.k.a. Forgy) + + ## Description + This initialization method randomly selects k centroids from the samples as initial centroids. + """ + @staticmethod + def init_precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit: + r""" + Precomputed centroids initialization method + + ## Description + This initialization method requires a precomputed list of k centroids to use as initial + centroids. + """ + @staticmethod + def algo_lloyd() -> KMeansAlgorithm: + r""" + Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). + (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + """ + @staticmethod + def algo_mini_batch(batch_size: builtins.int) -> KMeansAlgorithm: + r""" + Mini-Batch k-Means implementation. 
+ (see: https://dl.acm.org/citation.cfm?id=1772862) + + ## Arguments + - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower) + """ + def predict(self, points: numpy.typing.ArrayLike) -> tuple[builtins.list[builtins.int], builtins.list[builtins.float]]: + r""" + find the closest cluster and the distance for each point + """ + def silhouette_simple(self, points: numpy.typing.ArrayLike, assignments: numpy.typing.ArrayLike = None) -> builtins.float: + r""" + calculate the mean simple (using centroids) silhouette score for a set of points, + assignments must be specified if they do not correspond to the assignments in the KMeans instance + """ + +class KMeansAlgorithm: + r""" + Specify a kmeans algorithm using lloyd or mini_batch. + """ + @staticmethod + def lloyd() -> KMeansAlgorithm: + r""" + Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). + (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + """ + @staticmethod + def mini_batch(batch_size: builtins.int) -> KMeansAlgorithm: + r""" + Mini-Batch k-Means implementation. + (see: https://dl.acm.org/citation.cfm?id=1772862) + + ## Arguments + - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower) + """ + @typing.final + class Lloyd(KMeansAlgorithm): + __match_args__ = () + def __new__(cls) -> KMeansAlgorithm.Lloyd: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... + + @typing.final + class MiniBatch(KMeansAlgorithm): + __match_args__ = ("_0",) + @property + def _0(self) -> builtins.int: ... + def __new__(cls, _0: builtins.int) -> KMeansAlgorithm.MiniBatch: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... 
+ + +class KMeansInit: + r""" + Specify an initialization method using plusplus, random_partition, random_sample or precomputed. + """ + @staticmethod + def plusplus() -> KMeansInit: + r""" + K-Means++ initialization method, as implemented in Matlab + + ## Description + This initialization method starts by selecting one sample as first centroid. + Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating + each sample's probability of "being a centroid". This probability is bigger, the farther away a sample + is from its centroid. Then, one sample is randomly selected, while taking their probability of being + the next centroid into account. This leads to a tendency of selecting centroids, that are far away from + their currently assigned cluster's centroid. + (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + """ + @staticmethod + def random_partition() -> KMeansInit: + r""" + Random-Partition initialization method + + ## Description + This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means. + These means are then used as initial clusters. + """ + @staticmethod + def random_sample() -> KMeansInit: + r""" + Random sample initialization method (a.k.a. Forgy) + + ## Description + This initialization method randomly selects k centroids from the samples as initial centroids. + """ + @staticmethod + def precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit: + r""" + Precomputed centroids initialization method + + ## Description + This initialization method requires a precomputed list of k centroids to use as initial + centroids. + """ + @typing.final + class PlusPlus(KMeansInit): + __match_args__ = () + def __new__(cls) -> KMeansInit.PlusPlus: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... 
+ + @typing.final + class RandomPartition(KMeansInit): + __match_args__ = () + def __new__(cls) -> KMeansInit.RandomPartition: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... + + @typing.final + class RandomSample(KMeansInit): + __match_args__ = () + def __new__(cls) -> KMeansInit.RandomSample: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... + + @typing.final + class Precomputed(KMeansInit): + __match_args__ = ("_0",) + @property + def _0(self) -> builtins.list[builtins.float]: ... + def __new__(cls, _0: typing.Sequence[builtins.float]) -> KMeansInit.Precomputed: ... + def __len__(self) -> builtins.int: ... + def __getitem__(self, key: builtins.int) -> typing.Any: ... + + +def silhouette(points: numpy.typing.ArrayLike, assignments: numpy.typing.ArrayLike) -> builtins.float: + r""" + calculate the mean silhouette score for a set of points + """ + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..72e5849 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["maturin>=1.9.4,<2.0"] +build-backend = "maturin" + +[project] +name = "kmeans" +dynamic = ["version"] +authors = [ + { name = "Wim Pomp", email = "w.pomp@nki.nl" }, +] +readme = "README.md" +keywords = ["kmeans"] +description = "Python wrapper for Rust kmeans library." 
+requires-python = ">=3.10" +classifiers = [ + "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] + +[tool.maturin] +python-source = "py" +module-name = "kmeans_rs" + +[project.scripts] +kmeans_generate_stub = "kmeans_rs:kmeans_generate_stub" + +[tool.ruff] +line-length = 119 +indent-width = 4 + +[tool.isort] +line_length = 119 diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..5d56faf --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "nightly" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..117b2d9 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,696 @@ +use console::Term; +use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; +use kmeans::*; +use ndarray::{Array2, AsArray, Ix1, Ix2}; +use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2}; +use pyo3::exceptions::PyTypeError; +use pyo3::prelude::*; +use pyo3_stub_gen::derive::*; +use pyo3_stub_gen::{StubGenConfig, StubInfo}; +use rayon::prelude::*; +use std::collections::{HashMap, HashSet}; +use std::hash::Hash; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error(transparent)] + ProgressBarTemplate(#[from] indicatif::style::TemplateError), + #[error("shape mismatch: {0} != {1}")] + ShapeMismatch(usize, usize), + #[error("no centroids defined")] + NoCentroidsDefined, +} + +impl From for PyErr { + fn from(err: Error) -> PyErr { + color_eyre::eyre::Report::from(err).into() + } +} + +/// a progress bar with an ok style that when py::detach is used also works in jupyter +pub fn get_bar(count: Option) -> Result { + let style = ProgressStyle::with_template( + "{spinner:.green} {percent}% 
[{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]", + )?.progress_chars("#>-"); + let bar = ProgressBar::with_draw_target( + count.map(|i| i as u64), + ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20), + ) + .with_style(style); + bar.enable_steady_tick(Duration::from_millis(100)); + Ok(bar) +} + +trait Predict { + type Error; + + fn predict<'a, A>(&self, points: A) -> Result<(Vec, Vec), Self::Error> + where + A: AsArray<'a, T, Ix2>, + T: 'a; + + fn silhouette_simple<'p, 'a, P, A>( + &self, + points: P, + assignments: Option, + ) -> Result + where + P: AsArray<'p, f64, Ix2>, + A: AsArray<'a, usize, Ix1>; +} + +/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed. +#[gen_stub_pyclass_complex_enum] +#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)] +#[derive(Clone, Debug)] +pub(crate) enum PyKMeansInit { + PlusPlus(), + RandomPartition(), + RandomSample(), + Precomputed(Vec), +} + +#[gen_stub_pymethods] +#[pymethods] +impl PyKMeansInit { + /// K-Means++ initialization method, as implemented in Matlab + /// + /// ## Description + /// This initialization method starts by selecting one sample as first centroid. + /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating + /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample + /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being + /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from + /// their currently assigned cluster's centroid. 
+ /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + #[staticmethod] + pub(crate) fn plusplus() -> Self { + Self::PlusPlus() + } + + /// Random-Partition initialization method + /// + /// ## Description + /// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means. + /// These means are then used as initial centroids. + #[staticmethod] + pub(crate) fn random_partition() -> Self { + Self::RandomPartition() + } + + /// Random sample initialization method (a.k.a. Forgy) + /// + /// ## Description + /// This initialization method randomly selects k centroids from the samples as initial centroids. + #[staticmethod] + pub(crate) fn random_sample() -> Self { + Self::RandomSample() + } + + /// Precomputed centroids initialization method + /// + /// ## Description + /// This initialization method requires a precomputed 2D array of k centroids (one per row) to use as initial + /// centroids. + #[staticmethod] + pub(crate) fn precomputed( + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + centroids: PyArrayLike2, + ) -> Self { + Self::Precomputed(centroids.as_array().flatten().to_vec()) + } +} + +/// Specify a kmeans algorithm using lloyd or mini_batch. +#[gen_stub_pyclass_complex_enum] +#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)] +#[derive(Clone, Debug)] +pub(crate) enum PyKMeansAlgorithm { + Lloyd(), + MiniBatch(usize), +} + +#[gen_stub_pymethods] +#[pymethods] +impl PyKMeansAlgorithm { + /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). + /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + #[staticmethod] + pub(crate) fn lloyd() -> Self { + Self::Lloyd() + } + + /// Mini-Batch k-Means implementation.
+ /// (see: https://dl.acm.org/citation.cfm?id=1772862) + /// + /// ## Arguments + /// - **batch_size**: Number of samples to use per iteration (higher -> better approximation but slower) + #[staticmethod] + pub(crate) fn mini_batch(batch_size: usize) -> Self { + Self::MiniBatch(batch_size) + } +} + +/// Compute kmeans clustering. +/// This implementation is supposed to be faster than scipy or scikit-learn +/// when dealing with a lot of points. +/// +/// ## Arguments +/// - **points**: Numpy array #points x dimensions +/// - **k**: Number of clusters to search for +/// - **max_iter**: Limit the maximum number of iterations (default 300; just pass a high number for infinite) +/// - **init**: initialization method (default: plusplus) +/// - **algorithm**: algorithm to use (default: lloyd) +#[gen_stub_pyclass] +#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)] +#[derive(Clone, Debug)] +pub(crate) struct PyKMeans { + ndim: usize, + inner: KMeansState, +} + +#[gen_stub_pymethods] +#[pymethods] +impl PyKMeans { + #[new] + #[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))] + pub(crate) fn new( + py: Python, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + points: PyArrayLike2, + k: usize, + max_iter: usize, + init: Option, + algorithm: Option, + ) -> Self { + let points = points.as_array(); + py.detach(|| { + let shape = points.shape(); + let kmeans = if let Some(s) = points.as_slice() { + KMeans::::new(s, shape[0], shape[1], EuclideanDistance) + } else { + let v = points.flatten().to_vec(); + KMeans::::new(v.as_slice(), shape[0], shape[1], EuclideanDistance) + }; + let init = if let Some(init) = init { + init + } else { + PyKMeansInit::PlusPlus() + }; + let algorithm = if let Some(algorithm) = algorithm { + algorithm + } else { + PyKMeansAlgorithm::Lloyd() + }; + let config = KMeansConfig::default(); + match algorithm { + PyKMeansAlgorithm::Lloyd() => PyKMeans { + ndim: shape[1], + inner: match init { + PyKMeansInit::PlusPlus() =>
{ + kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config) + } + PyKMeansInit::RandomPartition() => { + kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config) + } + PyKMeansInit::RandomSample() => { + kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config) + } + PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd( + k, + max_iter, + KMeans::init_precomputed(centroids), + &config, + ), + }, + }, + PyKMeansAlgorithm::MiniBatch(size) => PyKMeans { + ndim: shape[1], + inner: match init { + PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch( + size, + k, + max_iter, + KMeans::init_kmeanplusplus, + &config, + ), + PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch( + size, + k, + max_iter, + KMeans::init_random_partition, + &config, + ), + PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch( + size, + k, + max_iter, + KMeans::init_random_sample, + &config, + ), + PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch( + size, + k, + max_iter, + KMeans::init_precomputed(centroids), + &config, + ), + }, + }, + } + }) + } + + /// K-Means++ initialization method, as implemented in Matlab + /// + /// ## Description + /// This initialization method starts by selecting one sample as the first centroid. + /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating + /// each sample's probability of "being a centroid". This probability grows the farther away a sample + /// is from its centroid. Then, one sample is randomly selected, weighted by each sample's probability of being + /// the next centroid. This leads to a tendency to select centroids that are far away from + /// their currently assigned cluster's centroid.
+ /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + #[staticmethod] + pub(crate) fn init_plusplus() -> PyKMeansInit { + PyKMeansInit::PlusPlus() + } + + /// Random-Partition initialization method + /// + /// ## Description + /// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means. + /// These means are then used as initial centroids. + #[staticmethod] + pub(crate) fn init_random_partition() -> PyKMeansInit { + PyKMeansInit::RandomPartition() + } + + /// Random sample initialization method (a.k.a. Forgy) + /// + /// ## Description + /// This initialization method randomly selects k centroids from the samples as initial centroids. + #[staticmethod] + pub(crate) fn init_random_sample() -> PyKMeansInit { + PyKMeansInit::RandomSample() + } + + /// Precomputed centroids initialization method + /// + /// ## Description + /// This initialization method requires a precomputed 2D array of k centroids (one per row) to use as initial + /// centroids. + #[staticmethod] + pub(crate) fn init_precomputed( + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + centroids: PyArrayLike2, + ) -> PyKMeansInit { + PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec()) + } + + /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase). + /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About) + #[staticmethod] + pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm { + PyKMeansAlgorithm::Lloyd() + } + + /// Mini-Batch k-Means implementation.
+ /// (see: https://dl.acm.org/citation.cfm?id=1772862) + /// + /// ## Arguments + /// - **batch_size**: Number of samples to use per iteration (higher -> better approximation but slower) + #[staticmethod] + pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm { + PyKMeansAlgorithm::MiniBatch(batch_size) + } + + /// for each point, find the index of the closest centroid and the distance to it; returns (indices, distances) + pub(crate) fn predict( + &self, + py: Python, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + points: PyArrayLike2, + ) -> PyResult<(Vec, Vec)> { + let points = points.as_array(); + Ok(py.detach(|| self.inner.predict(points))?) + } + + /// calculate the mean simplified silhouette score (distances to centroids rather than to all points) for a set of points, + /// assignments must be specified if the assignments stored in the KMeans instance do not correspond to these points + #[pyo3(signature = (points, assignments = None))] + pub(crate) fn silhouette_simple( + &self, + py: Python, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + points: PyArrayLike2, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + assignments: Option>, + ) -> PyResult { + let points = points.as_array(); + let assignments = assignments.as_ref().map(|a| a.as_array()); + Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
+ } + + /// number of dimensions + #[getter] + pub(crate) fn ndim(&self) -> usize { + self.ndim + } + + /// number of clusters + #[getter] + pub(crate) fn k(&self) -> usize { + self.inner.k + } + + /// sum of all distances, cost measure + #[getter] + pub(crate) fn distance_sum(&self) -> f64 { + self.inner.distsum + } + + /// centroid coordinates + #[getter] + pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult>> { + let v = self.inner.centroids.to_vec(); + Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v) + .map_err(|e| PyErr::new::(e.to_string()))? + .into_pyarray(py)) + } + + /// centroid frequencies + #[getter] + pub(crate) fn centroid_frequency(&self) -> Vec { + self.inner.centroid_frequency.clone() + } + + /// to which cluster each of the points is assigned + #[getter] + pub(crate) fn assignments(&self) -> Vec { + self.inner.assignments.clone() + } + + /// distance of each point to the centroid it is assigned to + #[getter] + pub(crate) fn centroid_distances(&self) -> Vec { + self.inner.centroid_distances.clone() + } +} + +impl Predict for KMeansState { + type Error = Error; + + fn predict<'a, A>(&self, points: A) -> Result<(Vec, Vec), Self::Error> + where + A: AsArray<'a, f64, Ix2>, + { + let centroids = self.centroids.to_vec(); + if self.k == 0 || centroids.is_empty() { + return Err(Error::NoCentroidsDefined); + } + let ndim = centroids.len() / self.k; // k != 0 was checked above, so this cannot divide by zero + let points = points.into(); + let shape = points.shape(); + if shape[1] != ndim { + return Err(Error::ShapeMismatch(shape[1], ndim)); + } + let fill = vec![0.0; 8 - ndim % 8]; + let e = EuclideanDistance; + let dist = |s: &[f64]| { + s.par_chunks_exact(ndim) + .map(|point| { + let (i, d) = centroids + .par_chunks_exact(ndim) + .enumerate() + .fold( + || (usize::MAX, f64::INFINITY), + |(i, a), (j, centroid)| { + let b = >::distance( + &e, + &[point, &fill].concat(), + &[centroid, &fill].concat(), + ); + if a <= b { (i, a) } else { (j, b) } + }, + ) + .reduce( + || (usize::MAX, f64::INFINITY), + |(i,
a), (j, b)| { + if a <= b { (i, a) } else { (j, b) } + }, + ); + (i, d.sqrt()) + }) + .collect::<(Vec<_>, Vec<_>)>() + }; + + Ok(if let Some(s) = points.as_slice() { + dist(s) + } else { + let s = points.flatten().to_vec(); + dist(&s) + }) + } + + fn silhouette_simple<'p, 'a, P, A>( + &self, + points: P, + assignments: Option, + ) -> Result + where + P: AsArray<'p, f64, Ix2>, + A: AsArray<'a, usize, Ix1>, + { + let points = points.into(); + let shape = points.shape(); + let centroids = Arc::new(self.centroids.to_vec()); + if self.k == 0 || centroids.is_empty() { + return Err(Error::NoCentroidsDefined); + } + let ndim = centroids.len() / self.k; // k != 0 was checked above, so this cannot divide by zero + + if shape[1] != ndim { + return Err(Error::ShapeMismatch(shape[1], ndim)); + } + + let assignments = if let Some(assignments) = assignments { + assignments.into().to_vec() + } else { + self.assignments.to_vec() + }; + let k = self.k; + let mut clusters = vec![Vec::new(); k]; + for (point, assignment) in points.rows().into_iter().zip(assignments) { + clusters[assignment].extend(point.to_vec()); + } + let fill = vec![0.0; 8 - ndim % 8]; + let a = clusters + .par_iter() + .zip(centroids.clone().par_chunks_exact(ndim)) + .flat_map(|(points, centroid)| { + let c = [centroid, &fill].concat(); + let fill = fill.clone(); + let e = EuclideanDistance; + points.par_chunks_exact(ndim).map(move |point| { + >::distance( + &e, + &c, + &[point, &fill].concat(), + ) + .sqrt() + }) + }) + .collect::>(); + + let b = clusters + .par_iter() + .enumerate() + .flat_map(|(i, points)| { + let centroids = centroids.clone(); + let fill = fill.clone(); + let e = EuclideanDistance; + points.par_chunks_exact(ndim).map(move |point| { + centroids + .par_chunks_exact(ndim) + .enumerate() + .map(|(j, centroid)| { + if i == j { + f64::INFINITY + } else { + >::distance( + &e, + &[centroid, &fill].concat(), + &[point, &fill].concat(), + ) + .sqrt() + } + }) + .min_by(|a, b| a.total_cmp(b)) + .unwrap_or(f64::INFINITY) + }) + }) + .collect::>(); + Ok(a.into_iter() + .zip(b)
+ .map(|(a, b)| (b - a) / a.max(b)) + .sum::() + / points.shape()[0] as f64) + } +} + +fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result +where + P: AsArray<'p, f64, Ix2>, + A: AsArray<'a, K, Ix1>, + K: 'a + Eq + Hash, +{ + let points = points.into(); + let assignments = assignments.into(); + let shape = points.shape(); + let n = shape[0]; + let ndim = shape[1]; + + let labels = assignments + .iter() + .collect::>() + .into_iter() + .enumerate() + .map(|(k, v)| (v, k)) + .collect::>(); + let assignments = assignments.iter().map(|k| labels[k]).collect::>(); + let k = labels.len(); + + let mut clusters = vec![Vec::new(); k]; + for (point, assignment) in points.rows().into_iter().zip(assignments) { + clusters[assignment].extend(point.to_vec()); + } + let bar = get_bar(Some(n + n * k))?; // one tick per point for `a`, plus one per (point, cluster) pair for `b` + let fill = vec![0.0; 8 - ndim % 8]; + let e = EuclideanDistance; + let a = clusters + .par_iter() + .flat_map(|points| { + let c = (points.len() / ndim - 1) as f64; + points + .par_chunks_exact(ndim) + .map(|i| { + let q = points + .par_chunks_exact(ndim) + .map(|j| { + >::distance( + &e, + &[i, &fill].concat(), + &[j, &fill].concat(), + ) + .sqrt() + }) + .sum::() + / c; + bar.inc(1); + q + }) + .collect::>() + }) + .collect::>(); + + let b = clusters + .par_iter() + .enumerate() + .flat_map(|(i, points_i)| { + points_i + .par_chunks_exact(ndim) + .map(|a| { + clusters + .par_iter() + .enumerate() + .map(|(j, points_j)| { + let c = (points_j.len() / ndim) as f64; + let q = if i == j { + f64::INFINITY + } else { + points_j + .par_chunks_exact(ndim) + .map(|b| { + >::distance( + &e, + &[a, &fill].concat(), + &[b, &fill].concat(), + ) + .sqrt() + }) + .sum::() + / c + }; + bar.inc(1); + q + }) + .min_by(|a, b| a.total_cmp(b)) + .unwrap_or(f64::INFINITY) + }) + .collect::>() + }) + .collect::>(); + bar.finish(); + Ok(a.into_iter() + .zip(b) + .map(|(a, b)| (b - a) / a.max(b)) + .sum::() + / points.shape()[0] as f64) +} + +/// calculate the mean silhouette score
for a set of points and their cluster assignments (pairwise point-to-point distances; shows a progress bar) +#[gen_stub_pyfunction(module = "kmeans_rs")] +#[pyfunction(name = "silhouette")] +pub(crate) fn py_silhouette( + py: Python, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + points: PyArrayLike2, + #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))] + assignments: PyArrayLike1, +) -> PyResult { + let points = points.as_array(); + let assignments = assignments.as_array(); + Ok(py.detach(|| silhouette(points, assignments))?) +} + +/// generates the kmeans_rs stub file (__init__.pyi) for the python source directory `<dest_path>/py` +#[pyfunction] +fn generate_stub(dest_path: String) -> PyResult<()> { + Ok(StubInfo::from_project_root( + "kmeans_rs".to_string(), + PathBuf::from(dest_path).join("py"), + true, + StubGenConfig::default(), + )? + .generate()?) +} + +#[pymodule] +#[pyo3(name = "kmeans_rs")] +mod kmeans_rs { + use pyo3::prelude::*; + + #[pymodule_export] + use super::generate_stub; + + #[pymodule_export] + use super::PyKMeans; + + #[pymodule_export] + use super::PyKMeansInit; + + #[pymodule_export] + use super::PyKMeansAlgorithm; + + #[pymodule_export] + use super::py_silhouette; + + #[pymodule_init] + fn init(_: &Bound<'_, PyModule>) -> PyResult<()> { + Ok(color_eyre::install()?) + } +}