first commit
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
This commit is contained in:
@@ -0,0 +1,181 @@
|
||||
# This file is autogenerated by maturin v1.8.4
|
||||
# To update, run
|
||||
#
|
||||
# maturin generate-ci github
|
||||
#
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
tags:
|
||||
- '*'
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ${{ matrix.platform.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
platform:
|
||||
- runner: ubuntu-22.04
|
||||
target: x86_64
|
||||
- runner: ubuntu-22.04
|
||||
target: x86
|
||||
- runner: ubuntu-22.04
|
||||
target: aarch64
|
||||
- runner: ubuntu-22.04
|
||||
target: armv7
|
||||
- runner: ubuntu-22.04
|
||||
target: s390x
|
||||
- runner: ubuntu-22.04
|
||||
target: ppc64le
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.x
|
||||
- name: Build wheels
|
||||
uses: PyO3/maturin-action@v1
|
||||
with:
|
||||
target: ${{ matrix.platform.target }}
|
||||
args: --release --out dist --find-interpreter
|
||||
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||
manylinux: auto
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels-linux-${{ matrix.platform.target }}
|
||||
path: dist
|
||||
|
||||
musllinux:
|
||||
runs-on: ${{ matrix.platform.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
platform:
|
||||
- runner: ubuntu-22.04
|
||||
target: x86_64
|
||||
- runner: ubuntu-22.04
|
||||
target: x86
|
||||
- runner: ubuntu-22.04
|
||||
target: aarch64
|
||||
- runner: ubuntu-22.04
|
||||
target: armv7
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.x
|
||||
- name: Build wheels
|
||||
uses: PyO3/maturin-action@v1
|
||||
with:
|
||||
target: ${{ matrix.platform.target }}
|
||||
args: --release --out dist --find-interpreter
|
||||
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||
manylinux: musllinux_1_2
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels-musllinux-${{ matrix.platform.target }}
|
||||
path: dist
|
||||
|
||||
windows:
|
||||
runs-on: ${{ matrix.platform.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
platform:
|
||||
- runner: windows-latest
|
||||
target: x64
|
||||
- runner: windows-latest
|
||||
target: x86
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.x
|
||||
architecture: ${{ matrix.platform.target }}
|
||||
- name: Build wheels
|
||||
uses: PyO3/maturin-action@v1
|
||||
with:
|
||||
target: ${{ matrix.platform.target }}
|
||||
args: --release --out dist --find-interpreter
|
||||
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels-windows-${{ matrix.platform.target }}
|
||||
path: dist
|
||||
|
||||
macos:
|
||||
runs-on: ${{ matrix.platform.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
platform:
|
||||
- runner: macos-13
|
||||
target: x86_64
|
||||
- runner: macos-14
|
||||
target: aarch64
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.x
|
||||
- name: Build wheels
|
||||
uses: PyO3/maturin-action@v1
|
||||
with:
|
||||
target: ${{ matrix.platform.target }}
|
||||
args: --release --out dist --find-interpreter
|
||||
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||
- name: Upload wheels
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels-macos-${{ matrix.platform.target }}
|
||||
path: dist
|
||||
|
||||
sdist:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Build sdist
|
||||
uses: PyO3/maturin-action@v1
|
||||
with:
|
||||
command: sdist
|
||||
args: --out dist
|
||||
- name: Upload sdist
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: wheels-sdist
|
||||
path: dist
|
||||
|
||||
release:
|
||||
name: Release
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
|
||||
needs: [linux, musllinux, windows, macos, sdist]
|
||||
permissions:
|
||||
# Use to sign the release artifacts
|
||||
id-token: write
|
||||
# Used to upload release artifacts
|
||||
contents: write
|
||||
# Used to generate artifact attestation
|
||||
attestations: write
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4
|
||||
- name: Generate artifact attestation
|
||||
uses: actions/attest-build-provenance@v2
|
||||
with:
|
||||
subject-path: 'wheels-*/*'
|
||||
- name: Publish to PyPI
|
||||
if: ${{ startsWith(github.ref, 'refs/tags/') }}
|
||||
uses: PyO3/maturin-action@v1
|
||||
env:
|
||||
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
||||
with:
|
||||
command: upload
|
||||
args: --non-interactive --skip-existing wheels-*/*
|
||||
+74
@@ -0,0 +1,74 @@
|
||||
/target
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
.pytest_cache/
|
||||
*.py[cod]
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
.venv/
|
||||
env/
|
||||
bin/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
include/
|
||||
man/
|
||||
venv/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
pip-selfcheck.json
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
# Mr Developer
|
||||
.mr.developer.cfg
|
||||
.project
|
||||
.pydevproject
|
||||
|
||||
# Rope
|
||||
.ropeproject
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
*.pot
|
||||
|
||||
.DS_Store
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyCharm
|
||||
.idea/
|
||||
|
||||
# VSCode
|
||||
.vscode/
|
||||
|
||||
# Pyenv
|
||||
.python-version
|
||||
|
||||
Cargo.lock
|
||||
+30
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kmeans_rs"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
rust-version = "1.94.0"
|
||||
authors = ["Wim Pomp <w.pomp@nki.nl>"]
|
||||
license = "MIT OR Apache-2.0"
|
||||
description = "Python wrapper for Rust kmeans library."
|
||||
homepage = "https://git.wimpomp.nl/wim/kmeans_rs"
|
||||
repository = "https://git.wimpomp.nl/wim/kmeans_rs"
|
||||
readme = "README.md"
|
||||
keywords = ["kmeans"]
|
||||
categories = ["science"]
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[lib]
|
||||
name = "kmeans_rs"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
color-eyre = "0.6"
|
||||
console = "0.16"
|
||||
indicatif = { version = "0.18", features = ["rayon"] }
|
||||
kmeans = "2"
|
||||
ndarray = "0.17"
|
||||
numpy = "0.28"
|
||||
pyo3 = { version = "0.28", features = ["abi3-py310", "anyhow", "eyre"] }
|
||||
pyo3-stub-gen = "0.22"
|
||||
rayon = "1"
|
||||
thiserror = "2"
|
||||
+201
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2015 - 2021 Ulrik Sverdrup "bluss",
|
||||
Jim Turner,
|
||||
and ndarray developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
@@ -0,0 +1,5 @@
|
||||
### KMeans
|
||||
A small library wrapping the unaffiliated Rust kmeans library: https://crates.io/crates/kmeans,
|
||||
kmeans is fast for big datasets because it uses multicore processing and SIMD.
|
||||
|
||||
Building requires rust nightly.
|
||||
@@ -0,0 +1,33 @@
|
||||
"""Package init: re-exports the compiled kmeans_rs extension and exposes version metadata."""
import os
import sys
from importlib.metadata import version
from pathlib import Path

# Must be set before the Rust extension is imported so panics carry
# full backtraces.
os.environ["RUST_BACKTRACE"] = "full"
os.environ["COLORBT_SHOW_HIDDEN"] = "1"

from .kmeans_rs import *  # noqa

# Version from installed package metadata; "unknown" when not installed.
try:
    __version__ = version(Path(__file__).parent.name)
except Exception:
    __version__ = "unknown"

# Best-effort git commit hash when running from a source checkout;
# HEAD looks like "ref: refs/heads/<branch>", the ref file holds the hash.
try:
    _git_dir = Path(__file__).parent.parent / ".git"
    _ref = (_git_dir / "HEAD").read_text().split(":")[1].strip()
    __git_commit_hash__ = (_git_dir / _ref).read_text().rstrip("\n")
except Exception:
    __git_commit_hash__ = "unknown"
|
||||
|
||||
|
||||
def kmeans_generate_stub():
    """Console-script entry point: regenerate the .pyi stub for kmeans_rs.

    Takes the project root from the first CLI argument (default: current
    working directory).

    Raises:
        ModuleNotFoundError: when ``<root>/py/kmeans_rs/__init__.py`` does
            not exist, i.e. the expected package layout was not found.
    """
    root = Path(sys.argv[1]).resolve() if len(sys.argv) > 1 else Path.cwd().resolve()
    init_py = root / "py" / "kmeans_rs" / "__init__.py"
    if not init_py.exists():
        raise ModuleNotFoundError(str(init_py))
    generate_stub(str(root))  # noqa
|
||||
@@ -0,0 +1,245 @@
|
||||
# This file is automatically generated by pyo3_stub_gen
|
||||
# ruff: noqa: E501, F401, F403, F405
|
||||
|
||||
import builtins
|
||||
import numpy
|
||||
import numpy.typing
|
||||
import typing
|
||||
__all__ = [
|
||||
"KMeans",
|
||||
"KMeansAlgorithm",
|
||||
"KMeansInit",
|
||||
"silhouette",
|
||||
]
|
||||
|
||||
# NOTE(review): this stub is generated by pyo3_stub_gen; mirror any fixes
# here in the Rust doc comments / signatures so regeneration keeps them.
@typing.final
class KMeans:
    r"""
    Compute kmeans clustering
    this implementation is supposed to be faster than scipy or scikit-learn
    when dealing with a lot of points

    ## Arguments
    - **points**: Numpy array #points x dimensions
    - **k**: Amount of clusters to search for
    - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
    - **init**: initialization method
    - **algorithm**: algorithm to use
    """
    @property
    def ndim(self) -> builtins.int:
        r"""
        number of dimensions
        """
    @property
    def k(self) -> builtins.int:
        r"""
        number of clusters
        """
    @property
    def distance_sum(self) -> builtins.float:
        r"""
        sum of all distances, cost measure
        """
    @property
    def centroids(self) -> numpy.typing.NDArray[numpy.float64]:
        r"""
        centroid coordinates
        """
    @property
    def centroid_frequency(self) -> builtins.list[builtins.int]:
        r"""
        centroid frequencies
        """
    @property
    def assignments(self) -> builtins.list[builtins.int]:
        r"""
        to which cluster each of the points is assigned
        """
    @property
    def centroid_distances(self) -> builtins.list[builtins.float]:
        r"""
        distances of all points to the center it's assigned to
        """
    def __new__(cls, points: numpy.typing.ArrayLike, k: builtins.int, max_iter: builtins.int = 300, init: typing.Optional[KMeansInit] = None, algorithm: typing.Optional[KMeansAlgorithm] = None) -> KMeans: ...
    @staticmethod
    def init_plusplus() -> KMeansInit:
        r"""
        K-Means++ initialization method, as implemented in Matlab

        ## Description
        This initialization method starts by selecting one sample as first centroid.
        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
        their currently assigned cluster's centroid.
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def init_random_partition() -> KMeansInit:
        r"""
        Random-Partition initialization method

        ## Description
        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
        These means are then used as initial clusters.
        """
    @staticmethod
    def init_random_sample() -> KMeansInit:
        r"""
        Random sample initialization method (a.k.a. Forgy)

        ## Description
        This initialization method randomly selects k centroids from the samples as initial centroids.
        """
    @staticmethod
    def init_precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
        r"""
        Precomputed centroids initialization method

        ## Description
        This initialization method requires a precomputed list of k centroids to use as initial
        centroids.
        """
    @staticmethod
    def algo_lloyd() -> KMeansAlgorithm:
        r"""
        Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def algo_mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
        r"""
        Mini-Batch k-Means implementation.
        (see: https://dl.acm.org/citation.cfm?id=1772862)

        ## Arguments
        - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
        """
    def predict(self, points: numpy.typing.ArrayLike) -> tuple[builtins.list[builtins.int], builtins.list[builtins.float]]:
        r"""
        find the closest cluster and the distance for each point
        """
    # Fixed implicit Optional: the default is None, so the annotation must be
    # Optional[ArrayLike] per PEP 484 (strict type checkers reject the bare form).
    def silhouette_simple(self, points: numpy.typing.ArrayLike, assignments: typing.Optional[numpy.typing.ArrayLike] = None) -> builtins.float:
        r"""
        calculate the mean simple (using centroids) silhouette score for a set of points,
        assignments must be specified if they do not correspond to the assignments in the KMeans instance
        """
|
||||
|
||||
class KMeansAlgorithm:
    r"""
    Selects the clustering algorithm; construct via lloyd or mini_batch.
    """
    @staticmethod
    def lloyd() -> KMeansAlgorithm:
        r"""
        Standard (Lloyd) K-Means iteration; the same one-phase algorithm as Matlab's.
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
        r"""
        Mini-Batch K-Means variant.
        (see: https://dl.acm.org/citation.cfm?id=1772862)

        ## Arguments
        - **batch_size**: samples drawn per iteration; larger values approximate full K-Means better but run slower
        """
    # Nested variant classes — presumably pyo3's complex-enum mapping; verify
    # against the Rust enum definition.
    @typing.final
    class Lloyd(KMeansAlgorithm):
        __match_args__ = ()
        def __new__(cls) -> KMeansAlgorithm.Lloyd: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class MiniBatch(KMeansAlgorithm):
        __match_args__ = ("_0",)
        @property
        def _0(self) -> builtins.int: ...
        def __new__(cls, _0: builtins.int) -> KMeansAlgorithm.MiniBatch: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...
|
||||
|
||||
|
||||
class KMeansInit:
    r"""
    Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
    """
    @staticmethod
    def plusplus() -> KMeansInit:
        r"""
        K-Means++ initialization method, as implemented in Matlab

        ## Description
        This initialization method starts by selecting one sample as first centroid.
        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
        their currently assigned cluster's centroid.
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def random_partition() -> KMeansInit:
        r"""
        Random-Partition initialization method

        ## Description
        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
        These means are then used as initial clusters.
        """
    @staticmethod
    def random_sample() -> KMeansInit:
        r"""
        Random sample initialization method (a.k.a. Forgy)

        ## Description
        This initialization method randomly selects k centroids from the samples as initial centroids.
        """
    @staticmethod
    def precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
        r"""
        Precomputed centroids initialization method

        ## Description
        This initialization method requires a precomputed list of k centroids to use as initial
        centroids.
        """
    @typing.final
    class PlusPlus(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.PlusPlus: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class RandomPartition(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.RandomPartition: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class RandomSample(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.RandomSample: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class Precomputed(KMeansInit):
        __match_args__ = ("_0",)
        @property
        def _0(self) -> builtins.list[builtins.float]: ...
        def __new__(cls, _0: typing.Sequence[builtins.float]) -> KMeansInit.Precomputed: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...
|
||||
|
||||
|
||||
# NOTE(review): generated stub for the Rust `py_silhouette` function; the
# docstring mirrors the Rust doc comment.
def silhouette(points: numpy.typing.ArrayLike, assignments: numpy.typing.ArrayLike) -> builtins.float:
    r"""
    calculate the mean silhouette score for a set of points
    """
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
[build-system]
|
||||
requires = ["maturin>=1.9.4,<2.0"]
|
||||
build-backend = "maturin"
|
||||
|
||||
[project]
|
||||
name = "kmeans"
|
||||
dynamic = ["version"]
|
||||
authors = [
|
||||
{ name = "Wim Pomp", email = "w.pomp@nki.nl" },
|
||||
]
|
||||
readme = "README.md"
|
||||
keywords = ["kmeans"]
|
||||
description = "Python wrapper for Rust kmeans library."
|
||||
requires-python = ">=3.8"
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Rust",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
|
||||
[tool.maturin]
|
||||
python-source = "py"
|
||||
module-name = "kmeans_rs"
|
||||
|
||||
[project.scripts]
|
||||
kmeans_generate_stub = "kmeans_rs:kmeans_generate_stub"
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 119
|
||||
indent-width = 4
|
||||
|
||||
[tool.isort]
|
||||
line_length = 119
|
||||
@@ -0,0 +1,2 @@
|
||||
[toolchain]
|
||||
channel = "nightly"
|
||||
+696
@@ -0,0 +1,696 @@
|
||||
use console::Term;
|
||||
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
|
||||
use kmeans::*;
|
||||
use ndarray::{Array2, AsArray, Ix1, Ix2};
|
||||
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
|
||||
use pyo3::exceptions::PyTypeError;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3_stub_gen::derive::*;
|
||||
use pyo3_stub_gen::{StubGenConfig, StubInfo};
|
||||
use rayon::prelude::*;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::hash::Hash;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Errors produced by this module; converted into Python exceptions via
/// the `From<Error> for PyErr` impl below.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// The indicatif progress-bar template string failed to parse.
    #[error(transparent)]
    ProgressBarTemplate(#[from] indicatif::style::TemplateError),
    /// Dimensionality of the points differs from that of the centroids.
    #[error("shape mismatch: {0} != {1}")]
    ShapeMismatch(usize, usize),
    /// An operation required centroids but the state holds none.
    #[error("no centroids defined")]
    NoCentroidsDefined,
}
|
||||
|
||||
impl From<Error> for PyErr {
|
||||
fn from(err: Error) -> PyErr {
|
||||
color_eyre::eyre::Report::from(err).into()
|
||||
}
|
||||
}
|
||||
|
||||
/// a progress bar with an ok style that when py::detach is used also works in jupyter
|
||||
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
|
||||
let style = ProgressStyle::with_template(
|
||||
"{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
|
||||
)?.progress_chars("#>-");
|
||||
let bar = ProgressBar::with_draw_target(
|
||||
count.map(|i| i as u64),
|
||||
ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20),
|
||||
)
|
||||
.with_style(style);
|
||||
bar.enable_steady_tick(Duration::from_millis(100));
|
||||
Ok(bar)
|
||||
}
|
||||
|
||||
/// Prediction and scoring operations on a fitted k-means state.
trait Predict<T> {
    type Error;

    /// Assign every row of `points` to its nearest centroid.
    /// Returns `(assignments, distances)` in row order.
    fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
    where
        A: AsArray<'a, T, Ix2>,
        T: 'a;

    /// Mean "simple" silhouette score: distances are measured to centroids
    /// rather than between all point pairs. When `assignments` is `None`,
    /// the assignments stored in the state are used.
    fn silhouette_simple<'p, 'a, P, A>(
        &self,
        points: P,
        assignments: Option<A>,
    ) -> Result<f64, Self::Error>
    where
        P: AsArray<'p, f64, Ix2>,
        A: AsArray<'a, usize, Ix1>;
}
|
||||
|
||||
/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansInit {
    // k-means++ seeding (see the `plusplus` constructor's docs)
    PlusPlus(),
    // random partition of the samples into k groups
    RandomPartition(),
    // k randomly chosen samples as initial centroids (Forgy)
    RandomSample(),
    // user-supplied centroids, stored flattened (k * ndim values)
    Precomputed(Vec<f64>),
}
|
||||
|
||||
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansInit {
    /// K-Means++ initialization method, as implemented in Matlab
    ///
    /// ## Description
    /// This initialization method starts by selecting one sample as first centroid.
    /// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
    /// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
    /// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
    /// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
    /// their currently assigned cluster's centroid.
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn plusplus() -> Self {
        Self::PlusPlus()
    }

    /// Random-Partition initialization method
    ///
    /// ## Description
    /// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
    /// These means are then used as initial clusters.
    #[staticmethod]
    pub(crate) fn random_partition() -> Self {
        Self::RandomPartition()
    }

    /// Random sample initialization method (a.k.a. Forgy)
    ///
    /// ## Description
    /// This initialization method randomly selects k centroids from the samples as initial centroids.
    #[staticmethod]
    pub(crate) fn random_sample() -> Self {
        Self::RandomSample()
    }

    /// Precomputed centroids initialization method
    ///
    /// ## Description
    /// This initialization method requires a precomputed list of k centroids to use as initial
    /// centroids.
    #[staticmethod]
    pub(crate) fn precomputed(
        #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
        centroids: PyArrayLike2<f64, AllowTypeChange>,
    ) -> Self {
        // Flatten the 2-D (k x ndim) array into the crate's flat layout.
        Self::Precomputed(centroids.as_array().flatten().to_vec())
    }
}
|
||||
|
||||
/// Specify a kmeans algorithm using lloyd or mini_batch.
#[gen_stub_pyclass_complex_enum]
#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) enum PyKMeansAlgorithm {
    // classic Lloyd iteration over the full data set
    Lloyd(),
    // mini-batch variant; the payload is the per-iteration batch size
    MiniBatch(usize),
}
|
||||
|
||||
#[gen_stub_pymethods]
#[pymethods]
impl PyKMeansAlgorithm {
    // The statics below expose the enum variants as Python constructors.

    /// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
    /// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
    #[staticmethod]
    pub(crate) fn lloyd() -> Self {
        Self::Lloyd()
    }

    /// Mini-Batch k-Means implementation.
    /// (see: https://dl.acm.org/citation.cfm?id=1772862)
    ///
    /// ## Arguments
    /// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
    #[staticmethod]
    pub(crate) fn mini_batch(batch_size: usize) -> Self {
        Self::MiniBatch(batch_size)
    }
}
|
||||
|
||||
/// Compute kmeans clustering
/// this implementation is supposed to be faster than scipy or scikit-learn
/// when dealing with a lot of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **k**: Amount of clusters to search for
/// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
/// - **init**: initialization method
/// - **algorithm**: algorithm to use
#[gen_stub_pyclass]
#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)]
#[derive(Clone, Debug)]
pub(crate) struct PyKMeans {
    /// number of dimensions per point (columns of the input array)
    ndim: usize,
    /// fitted state from the `kmeans` crate (centroids, assignments, ...)
    inner: KMeansState<f64>,
}
|
||||
|
||||
#[gen_stub_pymethods]
|
||||
#[pymethods]
|
||||
impl PyKMeans {
|
||||
#[new]
|
||||
#[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))]
|
||||
pub(crate) fn new(
|
||||
py: Python,
|
||||
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||
points: PyArrayLike2<f64, AllowTypeChange>,
|
||||
k: usize,
|
||||
max_iter: usize,
|
||||
init: Option<PyKMeansInit>,
|
||||
algorithm: Option<PyKMeansAlgorithm>,
|
||||
) -> Self {
|
||||
let points = points.as_array();
|
||||
py.detach(|| {
|
||||
let shape = points.shape();
|
||||
let kmeans = if let Some(s) = points.as_slice() {
|
||||
KMeans::<f64, 8, _>::new(s, shape[0], shape[1], EuclideanDistance)
|
||||
} else {
|
||||
let v = points.flatten().to_vec();
|
||||
KMeans::<f64, 8, _>::new(v.as_slice(), shape[0], shape[1], EuclideanDistance)
|
||||
};
|
||||
let init = if let Some(init) = init {
|
||||
init
|
||||
} else {
|
||||
PyKMeansInit::PlusPlus()
|
||||
};
|
||||
let algorithm = if let Some(algorithm) = algorithm {
|
||||
algorithm
|
||||
} else {
|
||||
PyKMeansAlgorithm::Lloyd()
|
||||
};
|
||||
let config = KMeansConfig::default();
|
||||
match algorithm {
|
||||
PyKMeansAlgorithm::Lloyd() => PyKMeans {
|
||||
ndim: shape[1],
|
||||
inner: match init {
|
||||
PyKMeansInit::PlusPlus() => {
|
||||
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config)
|
||||
}
|
||||
PyKMeansInit::RandomPartition() => {
|
||||
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config)
|
||||
}
|
||||
PyKMeansInit::RandomSample() => {
|
||||
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config)
|
||||
}
|
||||
PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd(
|
||||
k,
|
||||
max_iter,
|
||||
KMeans::init_precomputed(centroids),
|
||||
&config,
|
||||
),
|
||||
},
|
||||
},
|
||||
PyKMeansAlgorithm::MiniBatch(size) => PyKMeans {
|
||||
ndim: shape[1],
|
||||
inner: match init {
|
||||
PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch(
|
||||
size,
|
||||
k,
|
||||
max_iter,
|
||||
KMeans::init_kmeanplusplus,
|
||||
&config,
|
||||
),
|
||||
PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch(
|
||||
size,
|
||||
k,
|
||||
max_iter,
|
||||
KMeans::init_random_partition,
|
||||
&config,
|
||||
),
|
||||
PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch(
|
||||
size,
|
||||
k,
|
||||
max_iter,
|
||||
KMeans::init_random_sample,
|
||||
&config,
|
||||
),
|
||||
PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch(
|
||||
size,
|
||||
k,
|
||||
max_iter,
|
||||
KMeans::init_precomputed(centroids),
|
||||
&config,
|
||||
),
|
||||
},
|
||||
},
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Convenience statics on KMeans that mirror the PyKMeansInit / PyKMeansAlgorithm
// constructors, so users can write `KMeans.init_plusplus()` etc.

/// K-Means++ initialization method, as implemented in Matlab
///
/// ## Description
/// This initialization method starts by selecting one sample as first centroid.
/// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
/// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
/// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
/// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
/// their currently assigned cluster's centroid.
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
#[staticmethod]
pub(crate) fn init_plusplus() -> PyKMeansInit {
    PyKMeansInit::PlusPlus()
}

/// Random-Partition initialization method
///
/// ## Description
/// This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
/// These means are then used as initial clusters.
#[staticmethod]
pub(crate) fn init_random_partition() -> PyKMeansInit {
    PyKMeansInit::RandomPartition()
}

/// Random sample initialization method (a.k.a. Forgy)
///
/// ## Description
/// This initialization method randomly selects k centroids from the samples as initial centroids.
#[staticmethod]
pub(crate) fn init_random_sample() -> PyKMeansInit {
    PyKMeansInit::RandomSample()
}

/// Precomputed centroids initialization method
///
/// ## Description
/// This initialization method requires a precomputed list of k centroids to use as initial
/// centroids.
#[staticmethod]
pub(crate) fn init_precomputed(
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    centroids: PyArrayLike2<f64, AllowTypeChange>,
) -> PyKMeansInit {
    PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec())
}

/// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
#[staticmethod]
pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm {
    PyKMeansAlgorithm::Lloyd()
}

/// Mini-Batch k-Means implementation.
/// (see: https://dl.acm.org/citation.cfm?id=1772862)
///
/// ## Arguments
/// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
#[staticmethod]
pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm {
    PyKMeansAlgorithm::MiniBatch(batch_size)
}
|
||||
|
||||
/// find the closest cluster and the distance for each point
///
/// Returns `(assignments, distances)`: for each input row, the index of the
/// nearest centroid and the Euclidean distance to it.
pub(crate) fn predict(
    &self,
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
) -> PyResult<(Vec<usize>, Vec<f64>)> {
    let points = points.as_array();
    // Release the GIL for the distance computations.
    Ok(py.detach(|| self.inner.predict(points))?)
}
|
||||
|
||||
/// calculate the mean simple (using centroids) silhouette score for a set of points,
/// assignments must be specified if they do not correspond to the assignments in the KMeans instance
#[pyo3(signature = (points, assignments = None))]
pub(crate) fn silhouette_simple(
    &self,
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    assignments: Option<PyArrayLike1<usize, AllowTypeChange>>,
) -> PyResult<f64> {
    let points = points.as_array();
    // Borrow the optional assignments as an ndarray view before detaching.
    let assignments = assignments.as_ref().map(|a| a.as_array());
    // Release the GIL while the score is computed.
    Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
}
|
||||
|
||||
/// number of dimensions
#[getter]
pub(crate) fn ndim(&self) -> usize {
    self.ndim
}

/// number of clusters
#[getter]
pub(crate) fn k(&self) -> usize {
    self.inner.k
}

/// sum of all distances, cost measure
#[getter]
pub(crate) fn distance_sum(&self) -> f64 {
    self.inner.distsum
}

/// centroid coordinates
///
/// Returned as a (k, ndim) numpy array; reshape failures surface as TypeError.
#[getter]
pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
    let v = self.inner.centroids.to_vec();
    Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v)
        .map_err(|e| PyErr::new::<PyTypeError, String>(e.to_string()))?
        .into_pyarray(py))
}

/// centroid frequencies
#[getter]
pub(crate) fn centroid_frequency(&self) -> Vec<usize> {
    // cloned so Python owns an independent copy
    self.inner.centroid_frequency.clone()
}

/// to which cluster each of the points is assigned
#[getter]
pub(crate) fn assignments(&self) -> Vec<usize> {
    self.inner.assignments.clone()
}

/// distances of all points to the center it's assigned to
#[getter]
pub(crate) fn centroid_distances(&self) -> Vec<f64> {
    self.inner.centroid_distances.clone()
}
|
||||
}
|
||||
|
||||
impl Predict<f64> for KMeansState<f64> {
|
||||
type Error = Error;
|
||||
|
||||
fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<f64>), Self::Error>
|
||||
where
|
||||
A: AsArray<'a, f64, Ix2>,
|
||||
{
|
||||
let centroids = self.centroids.to_vec();
|
||||
let ndim = centroids.len() / self.k;
|
||||
let points = points.into();
|
||||
let shape = points.shape();
|
||||
if shape[1] != ndim {
|
||||
return Err(Error::ShapeMismatch(shape[1], ndim));
|
||||
}
|
||||
if centroids.is_empty() {
|
||||
return Err(Error::NoCentroidsDefined);
|
||||
}
|
||||
let fill = vec![0.0; 8 - ndim % 8];
|
||||
let e = EuclideanDistance;
|
||||
let dist = |s: &[f64]| {
|
||||
s.par_chunks_exact(ndim)
|
||||
.map(|point| {
|
||||
let (i, d) = centroids
|
||||
.par_chunks_exact(ndim)
|
||||
.enumerate()
|
||||
.fold(
|
||||
|| (usize::MAX, f64::INFINITY),
|
||||
|(i, a), (j, centroid)| {
|
||||
let b = <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||
&e,
|
||||
&[point, &fill].concat(),
|
||||
&[centroid, &fill].concat(),
|
||||
);
|
||||
if a <= b { (i, a) } else { (j, b) }
|
||||
},
|
||||
)
|
||||
.reduce(
|
||||
|| (usize::MAX, f64::INFINITY),
|
||||
|(i, a), (j, b)| {
|
||||
if a <= b { (i, a) } else { (j, b) }
|
||||
},
|
||||
);
|
||||
(i, d.sqrt())
|
||||
})
|
||||
.collect::<(Vec<_>, Vec<_>)>()
|
||||
};
|
||||
|
||||
Ok(if let Some(s) = points.as_slice() {
|
||||
dist(s)
|
||||
} else {
|
||||
let s = points.flatten().to_vec();
|
||||
dist(&s)
|
||||
})
|
||||
}
|
||||
|
||||
fn silhouette_simple<'p, 'a, P, A>(
|
||||
&self,
|
||||
points: P,
|
||||
assignments: Option<A>,
|
||||
) -> Result<f64, Self::Error>
|
||||
where
|
||||
P: AsArray<'p, f64, Ix2>,
|
||||
A: AsArray<'a, usize, Ix1>,
|
||||
{
|
||||
let points = points.into();
|
||||
let shape = points.shape();
|
||||
let centroids = Arc::new(self.centroids.to_vec());
|
||||
let ndim = centroids.len() / self.k;
|
||||
|
||||
if shape[1] != ndim {
|
||||
return Err(Error::ShapeMismatch(shape[1], ndim));
|
||||
}
|
||||
if centroids.is_empty() {
|
||||
return Err(Error::NoCentroidsDefined);
|
||||
}
|
||||
|
||||
let assignments = if let Some(assignments) = assignments {
|
||||
assignments.into().to_vec()
|
||||
} else {
|
||||
self.assignments.to_vec()
|
||||
};
|
||||
let k = self.k;
|
||||
let mut clusters = vec![Vec::new(); k];
|
||||
for (point, assignment) in points.rows().into_iter().zip(assignments) {
|
||||
clusters[assignment].extend(point.to_vec());
|
||||
}
|
||||
let fill = vec![0.0; 8 - ndim % 8];
|
||||
let a = clusters
|
||||
.par_iter()
|
||||
.zip(centroids.clone().par_chunks_exact(ndim))
|
||||
.flat_map(|(points, centroid)| {
|
||||
let c = [centroid, &fill].concat();
|
||||
let fill = fill.clone();
|
||||
let e = EuclideanDistance;
|
||||
points.par_chunks_exact(ndim).map(move |point| {
|
||||
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||
&e,
|
||||
&c,
|
||||
&[point, &fill].concat(),
|
||||
)
|
||||
.sqrt()
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let b = clusters
|
||||
.par_iter()
|
||||
.enumerate()
|
||||
.flat_map(|(i, points)| {
|
||||
let centroids = centroids.clone();
|
||||
let fill = fill.clone();
|
||||
let e = EuclideanDistance;
|
||||
points.par_chunks_exact(ndim).map(move |point| {
|
||||
centroids
|
||||
.par_chunks_exact(ndim)
|
||||
.enumerate()
|
||||
.map(|(j, centroid)| {
|
||||
if i == j {
|
||||
f64::INFINITY
|
||||
} else {
|
||||
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||
&e,
|
||||
&[centroid, &fill].concat(),
|
||||
&[point, &fill].concat(),
|
||||
)
|
||||
.sqrt()
|
||||
}
|
||||
})
|
||||
.min_by(|a, b| a.total_cmp(b))
|
||||
.unwrap_or(f64::INFINITY)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
Ok(a.into_iter()
|
||||
.zip(b)
|
||||
.map(|(a, b)| (b - a) / a.max(b))
|
||||
.sum::<f64>()
|
||||
/ points.shape()[0] as f64)
|
||||
}
|
||||
}
|
||||
|
||||
fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result<f64, Error>
|
||||
where
|
||||
P: AsArray<'p, f64, Ix2>,
|
||||
A: AsArray<'a, K, Ix1>,
|
||||
K: 'a + Eq + Hash,
|
||||
{
|
||||
let points = points.into();
|
||||
let assignments = assignments.into();
|
||||
let shape = points.shape();
|
||||
let n = shape[0];
|
||||
let ndim = shape[1];
|
||||
|
||||
let labels = assignments
|
||||
.iter()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(k, v)| (v, k))
|
||||
.collect::<HashMap<_, _>>();
|
||||
let assignments = assignments.iter().map(|k| labels[k]).collect::<Vec<_>>();
|
||||
let k = labels.len();
|
||||
|
||||
let mut clusters = vec![Vec::new(); k];
|
||||
for (point, assignment) in points.rows().into_iter().zip(assignments) {
|
||||
clusters[assignment].extend(point.to_vec());
|
||||
}
|
||||
let bar = get_bar(Some(k * n + k * n * k))?;
|
||||
let fill = vec![0.0; 8 - ndim % 8];
|
||||
let e = EuclideanDistance;
|
||||
let a = clusters
|
||||
.par_iter()
|
||||
.flat_map(|points| {
|
||||
let c = (points.len() / ndim - 1) as f64;
|
||||
points
|
||||
.par_chunks_exact(ndim)
|
||||
.map(|i| {
|
||||
let q = points
|
||||
.par_chunks_exact(ndim)
|
||||
.map(|j| {
|
||||
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||
&e,
|
||||
&[i, &fill].concat(),
|
||||
&[j, &fill].concat(),
|
||||
)
|
||||
.sqrt()
|
||||
})
|
||||
.sum::<f64>()
|
||||
/ c;
|
||||
bar.inc(1);
|
||||
q
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let b = clusters
|
||||
.par_iter()
|
||||
.enumerate()
|
||||
.flat_map(|(i, points_i)| {
|
||||
points_i
|
||||
.par_chunks_exact(ndim)
|
||||
.map(|a| {
|
||||
clusters
|
||||
.par_iter()
|
||||
.enumerate()
|
||||
.map(|(j, points_j)| {
|
||||
let c = (points_j.len() / ndim) as f64;
|
||||
let q = if i == j {
|
||||
f64::INFINITY
|
||||
} else {
|
||||
points_j
|
||||
.par_chunks_exact(ndim)
|
||||
.map(|b| {
|
||||
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||
&e,
|
||||
&[a, &fill].concat(),
|
||||
&[b, &fill].concat(),
|
||||
)
|
||||
.sqrt()
|
||||
})
|
||||
.sum::<f64>()
|
||||
/ c
|
||||
};
|
||||
bar.inc(1);
|
||||
q
|
||||
})
|
||||
.min_by(|a, b| a.total_cmp(b))
|
||||
.unwrap_or(f64::INFINITY)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
bar.finish();
|
||||
Ok(a.into_iter()
|
||||
.zip(b)
|
||||
.map(|(a, b)| (b - a) / a.max(b))
|
||||
.sum::<f64>()
|
||||
/ points.shape()[0] as f64)
|
||||
}
|
||||
|
||||
/// calculate the mean silhouette score for a set of points
///
/// ## Arguments
/// - **points**: Numpy array #points x dimensions
/// - **assignments**: cluster index per point (converted to usize)
#[gen_stub_pyfunction(module = "kmeans_rs")]
#[pyfunction(name = "silhouette")]
pub(crate) fn py_silhouette(
    py: Python,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    points: PyArrayLike2<f64, AllowTypeChange>,
    #[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
    assignments: PyArrayLike1<usize, AllowTypeChange>,
) -> PyResult<f64> {
    let points = points.as_array();
    let assignments = assignments.as_array();
    // Release the GIL; the heavy lifting is parallelized with rayon.
    Ok(py.detach(|| silhouette(points, assignments))?)
}
|
||||
|
||||
/// generates kmeans/__init__.pyi
///
/// `dest_path` is the project root; stub files are written under
/// `<dest_path>/py`. Exposed as the `kmeans_generate_stub` console script.
#[pyfunction]
fn generate_stub(dest_path: String) -> PyResult<()> {
    Ok(StubInfo::from_project_root(
        "kmeans_rs".to_string(),
        PathBuf::from(dest_path).join("py"),
        true,
        StubGenConfig::default(),
    )?
    .generate()?)
}
|
||||
|
||||
/// Python extension module definition for `kmeans_rs`.
#[pymodule]
#[pyo3(name = "kmeans_rs")]
mod kmeans_rs {
    use pyo3::prelude::*;

    #[pymodule_export]
    use super::generate_stub;

    #[pymodule_export]
    use super::PyKMeans;

    #[pymodule_export]
    use super::PyKMeansInit;

    #[pymodule_export]
    use super::PyKMeansAlgorithm;

    #[pymodule_export]
    use super::py_silhouette;

    /// Install color_eyre's error/panic hooks once, at module import time.
    #[pymodule_init]
    fn init(_: &Bound<'_, PyModule>) -> PyResult<()> {
        Ok(color_eyre::install()?)
    }
}
|
||||
Reference in New Issue
Block a user