first commit
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
CI / linux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 55s
CI / linux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:ppc64le]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:s390x]) (push) Failing after 4s
CI / linux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:aarch64]) (push) Failing after 6s
CI / musllinux (map[runner:ubuntu-22.04 target:armv7]) (push) Failing after 5s
CI / musllinux (map[runner:ubuntu-22.04 target:x86]) (push) Failing after 4s
CI / musllinux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 4s
CI / windows (map[runner:windows-latest target:x64]) (push) Has been cancelled
CI / windows (map[runner:windows-latest target:x86]) (push) Has been cancelled
CI / macos (map[runner:macos-13 target:x86_64]) (push) Has been cancelled
CI / macos (map[runner:macos-14 target:aarch64]) (push) Has been cancelled
CI / Release (push) Has been cancelled
CI / sdist (push) Has been cancelled
CI / linux (map[runner:ubuntu-22.04 target:x86_64]) (push) Failing after 5s
This commit is contained in:
@@ -0,0 +1,181 @@
|
|||||||
|
# This file is autogenerated by maturin v1.8.4
|
||||||
|
# To update, run
|
||||||
|
#
|
||||||
|
# maturin generate-ci github
|
||||||
|
#
|
||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
- master
|
||||||
|
tags:
|
||||||
|
- '*'
|
||||||
|
pull_request:
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
linux:
|
||||||
|
runs-on: ${{ matrix.platform.runner }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
platform:
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: x86_64
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: x86
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: aarch64
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: armv7
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: s390x
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: ppc64le
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: 3.x
|
||||||
|
- name: Build wheels
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
with:
|
||||||
|
target: ${{ matrix.platform.target }}
|
||||||
|
args: --release --out dist --find-interpreter
|
||||||
|
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||||
|
manylinux: auto
|
||||||
|
- name: Upload wheels
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: wheels-linux-${{ matrix.platform.target }}
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
musllinux:
|
||||||
|
runs-on: ${{ matrix.platform.runner }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
platform:
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: x86_64
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: x86
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: aarch64
|
||||||
|
- runner: ubuntu-22.04
|
||||||
|
target: armv7
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: 3.x
|
||||||
|
- name: Build wheels
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
with:
|
||||||
|
target: ${{ matrix.platform.target }}
|
||||||
|
args: --release --out dist --find-interpreter
|
||||||
|
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||||
|
manylinux: musllinux_1_2
|
||||||
|
- name: Upload wheels
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: wheels-musllinux-${{ matrix.platform.target }}
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
windows:
|
||||||
|
runs-on: ${{ matrix.platform.runner }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
platform:
|
||||||
|
- runner: windows-latest
|
||||||
|
target: x64
|
||||||
|
- runner: windows-latest
|
||||||
|
target: x86
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: 3.x
|
||||||
|
architecture: ${{ matrix.platform.target }}
|
||||||
|
- name: Build wheels
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
with:
|
||||||
|
target: ${{ matrix.platform.target }}
|
||||||
|
args: --release --out dist --find-interpreter
|
||||||
|
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||||
|
- name: Upload wheels
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: wheels-windows-${{ matrix.platform.target }}
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
macos:
|
||||||
|
runs-on: ${{ matrix.platform.runner }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
platform:
|
||||||
|
- runner: macos-13
|
||||||
|
target: x86_64
|
||||||
|
- runner: macos-14
|
||||||
|
target: aarch64
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: 3.x
|
||||||
|
- name: Build wheels
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
with:
|
||||||
|
target: ${{ matrix.platform.target }}
|
||||||
|
args: --release --out dist --find-interpreter
|
||||||
|
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
|
||||||
|
- name: Upload wheels
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: wheels-macos-${{ matrix.platform.target }}
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
sdist:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Build sdist
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
with:
|
||||||
|
command: sdist
|
||||||
|
args: --out dist
|
||||||
|
- name: Upload sdist
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: wheels-sdist
|
||||||
|
path: dist
|
||||||
|
|
||||||
|
release:
|
||||||
|
name: Release
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
|
||||||
|
needs: [linux, musllinux, windows, macos, sdist]
|
||||||
|
permissions:
|
||||||
|
# Used to sign the release artifacts
|
||||||
|
id-token: write
|
||||||
|
# Used to upload release artifacts
|
||||||
|
contents: write
|
||||||
|
# Used to generate artifact attestation
|
||||||
|
attestations: write
|
||||||
|
steps:
|
||||||
|
- uses: actions/download-artifact@v4
|
||||||
|
- name: Generate artifact attestation
|
||||||
|
uses: actions/attest-build-provenance@v2
|
||||||
|
with:
|
||||||
|
subject-path: 'wheels-*/*'
|
||||||
|
- name: Publish to PyPI
|
||||||
|
if: ${{ startsWith(github.ref, 'refs/tags/') }}
|
||||||
|
uses: PyO3/maturin-action@v1
|
||||||
|
env:
|
||||||
|
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
||||||
|
with:
|
||||||
|
command: upload
|
||||||
|
args: --non-interactive --skip-existing wheels-*/*
|
||||||
+74
@@ -0,0 +1,74 @@
|
|||||||
|
/target
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
.pytest_cache/
|
||||||
|
*.py[cod]
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
bin/
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
include/
|
||||||
|
man/
|
||||||
|
venv/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
pip-selfcheck.json
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.coverage
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
|
||||||
|
# Mr Developer
|
||||||
|
.mr.developer.cfg
|
||||||
|
.project
|
||||||
|
.pydevproject
|
||||||
|
|
||||||
|
# Rope
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
# VSCode
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
# Pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
Cargo.lock
|
||||||
+30
@@ -0,0 +1,30 @@
|
|||||||
|
[package]
|
||||||
|
name = "kmeans_rs"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
rust-version = "1.94.0"
|
||||||
|
authors = ["Wim Pomp <w.pomp@nki.nl>"]
|
||||||
|
license = "MIT OR Apache-2.0"
|
||||||
|
description = "Python wrapper for Rust kmeans library."
|
||||||
|
homepage = "https://git.wimpomp.nl/wim/kmeans_rs"
|
||||||
|
repository = "https://git.wimpomp.nl/wim/kmeans_rs"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["kmeans"]
|
||||||
|
categories = ["science"]
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
[lib]
|
||||||
|
name = "kmeans_rs"
|
||||||
|
crate-type = ["cdylib"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
color-eyre = "0.6"
|
||||||
|
console = "0.16"
|
||||||
|
indicatif = { version = "0.18", features = ["rayon"] }
|
||||||
|
kmeans = "2"
|
||||||
|
ndarray = "0.17"
|
||||||
|
numpy = "0.28"
|
||||||
|
pyo3 = { version = "0.28", features = ["abi3-py310", "anyhow", "eyre"] }
|
||||||
|
pyo3-stub-gen = "0.22"
|
||||||
|
rayon = "1"
|
||||||
|
thiserror = "2"
|
||||||
+201
@@ -0,0 +1,201 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
+27
@@ -0,0 +1,27 @@
|
|||||||
|
Copyright (c) 2015 - 2021 Ulrik Sverdrup "bluss",
|
||||||
|
Jim Turner,
|
||||||
|
and ndarray developers
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any
|
||||||
|
person obtaining a copy of this software and associated
|
||||||
|
documentation files (the "Software"), to deal in the
|
||||||
|
Software without restriction, including without
|
||||||
|
limitation the rights to use, copy, modify, merge,
|
||||||
|
publish, distribute, sublicense, and/or sell copies of
|
||||||
|
the Software, and to permit persons to whom the Software
|
||||||
|
is furnished to do so, subject to the following
|
||||||
|
conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice
|
||||||
|
shall be included in all copies or substantial portions
|
||||||
|
of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||||
|
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||||
|
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||||
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||||
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||||
|
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
### KMeans
|
||||||
|
A small library wrapping the unaffiliated Rust kmeans library: https://crates.io/crates/kmeans.
|
||||||
|
kmeans is fast for big datasets because it uses multicore processing and SIMD.
|
||||||
|
|
||||||
|
Building requires rust nightly.
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from importlib.metadata import version
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Request full Rust backtraces and un-hidden backtrace frames via environment variables.
# NOTE(review): presumably these must be set before the compiled extension below is
# imported so the Rust side picks them up — confirm. They overwrite any user-set value.
os.environ["RUST_BACKTRACE"] = "full"
os.environ["COLORBT_SHOW_HIDDEN"] = "1"

# Re-export everything from the compiled extension module.
from .kmeans_rs import *  # noqa

# Look up the installed distribution's version, using this package's directory name as
# the distribution name; any failure falls back to "unknown".
try:
    __version__ = version(Path(__file__).parent.name)
except (Exception,):
    __version__ = "unknown"
|
||||||
|
|
||||||
|
# Best-effort lookup of the current commit hash when running from a source checkout.
# `.git/HEAD` holds either a symbolic ref ("ref: refs/heads/<branch>") or, for a
# detached HEAD, the commit hash itself. Any failure (no checkout, unreadable file,
# ref stored only in packed-refs, ...) falls back to "unknown".
try:
    git_dir = Path(__file__).parent.parent / ".git"
    with open(git_dir / "HEAD") as g:
        head = g.read().strip()
    if head.startswith("ref:"):
        # Symbolic ref: the hash lives in the file the ref points at.
        with open(git_dir / head.split(":", 1)[1].strip()) as h:
            __git_commit_hash__ = h.read().strip()
    else:
        # Detached HEAD: the file content is the hash (empty file -> "unknown").
        __git_commit_hash__ = head or "unknown"
except Exception:
    __git_commit_hash__ = "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def kmeans_generate_stub():
    """Generate the type stub for a kmeans_rs source tree.

    The project root is taken from the first command-line argument when one is
    given, otherwise from the current working directory; either way it is resolved
    to an absolute path. The root must contain ``py/kmeans_rs/__init__.py``.

    Raises:
        ModuleNotFoundError: if ``py/kmeans_rs/__init__.py`` does not exist under
            the root; the message is the missing file's path.
    """
    root = (Path(sys.argv[1]) if len(sys.argv) > 1 else Path.cwd()).resolve()
    package_init = root / "py" / "kmeans_rs" / "__init__.py"
    if not package_init.exists():
        raise ModuleNotFoundError(str(package_init))
    # NOTE(review): generate_stub is not defined in this file; presumably it comes
    # from the star-import of the compiled extension — confirm.
    generate_stub(str(root))  # noqa
|
||||||
@@ -0,0 +1,245 @@
|
|||||||
|
# This file is automatically generated by pyo3_stub_gen
|
||||||
|
# ruff: noqa: E501, F401, F403, F405
|
||||||
|
|
||||||
|
import builtins
|
||||||
|
import numpy
|
||||||
|
import numpy.typing
|
||||||
|
import typing
|
||||||
|
__all__ = [
|
||||||
|
"KMeans",
|
||||||
|
"KMeansAlgorithm",
|
||||||
|
"KMeansInit",
|
||||||
|
"silhouette",
|
||||||
|
]
|
||||||
|
|
||||||
|
@typing.final
class KMeans:
    r"""
    Compute k-means clustering.

    This implementation is supposed to be faster than scipy or scikit-learn
    when dealing with a lot of points.

    ## Arguments
    - **points**: Numpy array #points x dimensions
    - **k**: Amount of clusters to search for
    - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite), defaults to 300
    - **init**: initialization method (a `KMeansInit`), optional
    - **algorithm**: algorithm to use (a `KMeansAlgorithm`), optional
    """
    @property
    def ndim(self) -> builtins.int:
        r"""
        number of dimensions
        """
    @property
    def k(self) -> builtins.int:
        r"""
        number of clusters
        """
    @property
    def distance_sum(self) -> builtins.float:
        r"""
        sum of all distances, cost measure
        """
    @property
    def centroids(self) -> numpy.typing.NDArray[numpy.float64]:
        r"""
        centroid coordinates
        """
    @property
    def centroid_frequency(self) -> builtins.list[builtins.int]:
        r"""
        centroid frequencies
        """
    @property
    def assignments(self) -> builtins.list[builtins.int]:
        r"""
        to which cluster each of the points is assigned
        """
    @property
    def centroid_distances(self) -> builtins.list[builtins.float]:
        r"""
        distance of each point to the center it is assigned to
        """
    def __new__(cls, points: numpy.typing.ArrayLike, k: builtins.int, max_iter: builtins.int = 300, init: typing.Optional[KMeansInit] = None, algorithm: typing.Optional[KMeansAlgorithm] = None) -> KMeans: ...
    @staticmethod
    def init_plusplus() -> KMeansInit:
        r"""
        K-Means++ initialization method, as implemented in Matlab

        ## Description
        This initialization method starts by selecting one sample as first centroid.
        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
        their currently assigned cluster's centroid.
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def init_random_partition() -> KMeansInit:
        r"""
        Random-Partition initialization method

        ## Description
        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
        These means are then used as initial clusters.
        """
    @staticmethod
    def init_random_sample() -> KMeansInit:
        r"""
        Random sample initialization method (a.k.a. Forgy)

        ## Description
        This initialization method randomly selects k centroids from the samples as initial centroids.
        """
    @staticmethod
    def init_precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
        r"""
        Precomputed centroids initialization method

        ## Description
        This initialization method requires a precomputed list of k centroids to use as initial
        centroids.
        """
    @staticmethod
    def algo_lloyd() -> KMeansAlgorithm:
        r"""
        Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def algo_mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
        r"""
        Mini-Batch k-Means implementation.
        (see: https://dl.acm.org/citation.cfm?id=1772862)

        ## Arguments
        - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
        """
    def predict(self, points: numpy.typing.ArrayLike) -> tuple[builtins.list[builtins.int], builtins.list[builtins.float]]:
        r"""
        find the closest cluster and the distance for each point

        Returns a tuple of two lists (integers, floats).
        NOTE(review): presumably (cluster indices, distances), one entry per point — confirm.
        """
    def silhouette_simple(self, points: numpy.typing.ArrayLike, assignments: typing.Optional[numpy.typing.ArrayLike] = None) -> builtins.float:
        r"""
        calculate the mean simple (using centroids) silhouette score for a set of points;
        assignments must be specified if they do not correspond to the assignments in the KMeans instance
        """
|
||||||
|
|
||||||
|
class KMeansAlgorithm:
    r"""
    Specify a kmeans algorithm using lloyd or mini_batch.

    The concrete choices are exposed as the nested classes `Lloyd` and `MiniBatch`.
    """
    @staticmethod
    def lloyd() -> KMeansAlgorithm:
        r"""
        Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """
    @staticmethod
    def mini_batch(batch_size: builtins.int) -> KMeansAlgorithm:
        r"""
        Mini-Batch k-Means implementation.
        (see: https://dl.acm.org/citation.cfm?id=1772862)

        ## Arguments
        - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
        """
    # Variant without positional fields (empty __match_args__).
    @typing.final
    class Lloyd(KMeansAlgorithm):
        __match_args__ = ()
        def __new__(cls) -> KMeansAlgorithm.Lloyd: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    # Variant with one positional integer field `_0`.
    # NOTE(review): presumably `_0` is the batch size passed to mini_batch — confirm.
    @typing.final
    class MiniBatch(KMeansAlgorithm):
        __match_args__ = ("_0",)
        @property
        def _0(self) -> builtins.int: ...
        def __new__(cls, _0: builtins.int) -> KMeansAlgorithm.MiniBatch: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...
|
||||||
|
|
||||||
|
|
||||||
|
class KMeansInit:
    r"""
    Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
    """

    @staticmethod
    def plusplus() -> KMeansInit:
        r"""
        K-Means++ initialization method, as implemented in Matlab

        ## Description
        This initialization method starts by selecting one sample as first centroid.
        Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
        each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
        is from its centroid. Then, one sample is randomly selected, while taking their probability of being
        the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
        their currently assigned cluster's centroid.
        (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
        """

    @staticmethod
    def random_partition() -> KMeansInit:
        r"""
        Random-Partition initialization method

        ## Description
        This initialization method randomly partitions the samples into k partitions, and then calculates these partitions' means.
        These means are then used as initial clusters.
        """

    @staticmethod
    def random_sample() -> KMeansInit:
        r"""
        Random sample initialization method (a.k.a. Forgy)

        ## Description
        This initialization method randomly selects k centroids from the samples as initial centroids.
        """

    @staticmethod
    def precomputed(centroids: numpy.typing.ArrayLike) -> KMeansInit:
        r"""
        Precomputed centroids initialization method

        ## Description
        This initialization method requires a precomputed list of k centroids to use as initial
        centroids.
        """

    @typing.final
    class PlusPlus(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.PlusPlus: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class RandomPartition(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.RandomPartition: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    @typing.final
    class RandomSample(KMeansInit):
        __match_args__ = ()
        def __new__(cls) -> KMeansInit.RandomSample: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...

    # Variant carrying the centroid coordinates as a flat list, exposed as `_0`.
    @typing.final
    class Precomputed(KMeansInit):
        __match_args__ = ("_0",)
        @property
        def _0(self) -> builtins.list[builtins.float]: ...
        def __new__(cls, _0: typing.Sequence[builtins.float]) -> KMeansInit.Precomputed: ...
        def __len__(self) -> builtins.int: ...
        def __getitem__(self, key: builtins.int) -> typing.Any: ...
|
||||||
|
|
||||||
|
|
||||||
|
def silhouette(
    points: numpy.typing.ArrayLike,
    assignments: numpy.typing.ArrayLike,
) -> builtins.float:
    r"""
    calculate the mean silhouette score for a set of points
    """
|
||||||
|
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["maturin>=1.9.4,<2.0"]
|
||||||
|
build-backend = "maturin"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "kmeans"
|
||||||
|
dynamic = ["version"]
|
||||||
|
authors = [
|
||||||
|
{ name = "Wim Pomp", email = "w.pomp@nki.nl" },
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["kmeans"]
|
||||||
|
description = "Python wrapper for Rust kmeans library."
|
||||||
|
requires-python = ">=3.8"
|
||||||
|
classifiers = [
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Programming Language :: Rust",
|
||||||
|
"Programming Language :: Python :: Implementation :: CPython",
|
||||||
|
"Programming Language :: Python :: Implementation :: PyPy",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.maturin]
|
||||||
|
python-source = "py"
|
||||||
|
module-name = "kmeans_rs"
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
kmeans_generate_stub = "kmeans_rs:kmeans_generate_stub"
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 119
|
||||||
|
indent-width = 4
|
||||||
|
|
||||||
|
[tool.isort]
|
||||||
|
line_length = 119
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[toolchain]
|
||||||
|
channel = "nightly"
|
||||||
+696
@@ -0,0 +1,696 @@
|
|||||||
|
use console::Term;
|
||||||
|
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
|
||||||
|
use kmeans::*;
|
||||||
|
use ndarray::{Array2, AsArray, Ix1, Ix2};
|
||||||
|
use numpy::{AllowTypeChange, IntoPyArray, PyArray2, PyArrayLike1, PyArrayLike2};
|
||||||
|
use pyo3::exceptions::PyTypeError;
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
use pyo3_stub_gen::derive::*;
|
||||||
|
use pyo3_stub_gen::{StubGenConfig, StubInfo};
|
||||||
|
use rayon::prelude::*;
|
||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::hash::Hash;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum Error {
|
||||||
|
#[error(transparent)]
|
||||||
|
ProgressBarTemplate(#[from] indicatif::style::TemplateError),
|
||||||
|
#[error("shape mismatch: {0} != {1}")]
|
||||||
|
ShapeMismatch(usize, usize),
|
||||||
|
#[error("no centroids defined")]
|
||||||
|
NoCentroidsDefined,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<Error> for PyErr {
|
||||||
|
fn from(err: Error) -> PyErr {
|
||||||
|
color_eyre::eyre::Report::from(err).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// a progress bar with an ok style that when py::detach is used also works in jupyter
|
||||||
|
pub fn get_bar(count: Option<usize>) -> Result<ProgressBar, Error> {
|
||||||
|
let style = ProgressStyle::with_template(
|
||||||
|
"{spinner:.green} {percent}% [{wide_bar:.green/lime}] {pos:>7}/{len:7} [{elapsed}/{eta}, {per_sec:<5}]",
|
||||||
|
)?.progress_chars("#>-");
|
||||||
|
let bar = ProgressBar::with_draw_target(
|
||||||
|
count.map(|i| i as u64),
|
||||||
|
ProgressDrawTarget::term_like_with_hz(Box::new(Term::buffered_stdout()), 20),
|
||||||
|
)
|
||||||
|
.with_style(style);
|
||||||
|
bar.enable_steady_tick(Duration::from_millis(100));
|
||||||
|
Ok(bar)
|
||||||
|
}
|
||||||
|
|
||||||
|
trait Predict<T> {
|
||||||
|
type Error;
|
||||||
|
|
||||||
|
fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<T>), Self::Error>
|
||||||
|
where
|
||||||
|
A: AsArray<'a, T, Ix2>,
|
||||||
|
T: 'a;
|
||||||
|
|
||||||
|
fn silhouette_simple<'p, 'a, P, A>(
|
||||||
|
&self,
|
||||||
|
points: P,
|
||||||
|
assignments: Option<A>,
|
||||||
|
) -> Result<f64, Self::Error>
|
||||||
|
where
|
||||||
|
P: AsArray<'p, f64, Ix2>,
|
||||||
|
A: AsArray<'a, usize, Ix1>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Specify an initialization method using plusplus, random_partition, random_sample or precomputed.
|
||||||
|
#[gen_stub_pyclass_complex_enum]
|
||||||
|
#[pyclass(name = "KMeansInit", module = "kmeans_rs", from_py_object)]
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub(crate) enum PyKMeansInit {
|
||||||
|
PlusPlus(),
|
||||||
|
RandomPartition(),
|
||||||
|
RandomSample(),
|
||||||
|
Precomputed(Vec<f64>),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[gen_stub_pymethods]
|
||||||
|
#[pymethods]
|
||||||
|
impl PyKMeansInit {
|
||||||
|
/// K-Means++ initialization method, as implemented in Matlab
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method starts by selecting one sample as first centroid.
|
||||||
|
/// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
|
||||||
|
/// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
|
||||||
|
/// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
|
||||||
|
/// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
|
||||||
|
/// their currently assigned cluster's centroid.
|
||||||
|
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn plusplus() -> Self {
|
||||||
|
Self::PlusPlus()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Random-Partition initialization method
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
|
||||||
|
/// These means are then used as initial clusters.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn random_partition() -> Self {
|
||||||
|
Self::RandomPartition()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Random sample initialization method (a.k.a. Forgy)
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method randomly selects k centroids from the samples as initial centroids.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn random_sample() -> Self {
|
||||||
|
Self::RandomSample()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Precomputed centroids initialization method
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method requires a precomputed list of k centroids to use as initial
|
||||||
|
/// centroids.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn precomputed(
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
centroids: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
) -> Self {
|
||||||
|
Self::Precomputed(centroids.as_array().flatten().to_vec())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Specify a kmeans algorithm using lloyd or mini_batch.
|
||||||
|
#[gen_stub_pyclass_complex_enum]
|
||||||
|
#[pyclass(name = "KMeansAlgorithm", module = "kmeans_rs", from_py_object)]
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub(crate) enum PyKMeansAlgorithm {
|
||||||
|
Lloyd(),
|
||||||
|
MiniBatch(usize),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[gen_stub_pymethods]
|
||||||
|
#[pymethods]
|
||||||
|
impl PyKMeansAlgorithm {
|
||||||
|
/// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
|
||||||
|
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn lloyd() -> Self {
|
||||||
|
Self::Lloyd()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mini-Batch k-Means implementation.
|
||||||
|
/// (see: https://dl.acm.org/citation.cfm?id=1772862)
|
||||||
|
///
|
||||||
|
/// ## Arguments
|
||||||
|
/// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn mini_batch(batch_size: usize) -> Self {
|
||||||
|
Self::MiniBatch(batch_size)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute kmeans clustering
|
||||||
|
/// this implementation is supposed to be faster than scipy or scikit-learn
|
||||||
|
/// when dealing with a lot of points
|
||||||
|
///
|
||||||
|
/// ## Arguments
|
||||||
|
/// - **points**: Numpy array #points x dimensions
|
||||||
|
/// - **k**: Amount of clusters to search for
|
||||||
|
/// - **max_iter**: Limit the maximum amount of iterations (just pass a high number for infinite)
|
||||||
|
/// - **init**: initialization method
|
||||||
|
/// - **algorithm**: algorithm to use
|
||||||
|
#[gen_stub_pyclass]
|
||||||
|
#[pyclass(name = "KMeans", module = "kmeans_rs", from_py_object)]
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub(crate) struct PyKMeans {
|
||||||
|
ndim: usize,
|
||||||
|
inner: KMeansState<f64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[gen_stub_pymethods]
|
||||||
|
#[pymethods]
|
||||||
|
impl PyKMeans {
|
||||||
|
#[new]
|
||||||
|
#[pyo3(signature = (points, k, max_iter=300, init=None, algorithm=None))]
|
||||||
|
pub(crate) fn new(
|
||||||
|
py: Python,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
points: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
k: usize,
|
||||||
|
max_iter: usize,
|
||||||
|
init: Option<PyKMeansInit>,
|
||||||
|
algorithm: Option<PyKMeansAlgorithm>,
|
||||||
|
) -> Self {
|
||||||
|
let points = points.as_array();
|
||||||
|
py.detach(|| {
|
||||||
|
let shape = points.shape();
|
||||||
|
let kmeans = if let Some(s) = points.as_slice() {
|
||||||
|
KMeans::<f64, 8, _>::new(s, shape[0], shape[1], EuclideanDistance)
|
||||||
|
} else {
|
||||||
|
let v = points.flatten().to_vec();
|
||||||
|
KMeans::<f64, 8, _>::new(v.as_slice(), shape[0], shape[1], EuclideanDistance)
|
||||||
|
};
|
||||||
|
let init = if let Some(init) = init {
|
||||||
|
init
|
||||||
|
} else {
|
||||||
|
PyKMeansInit::PlusPlus()
|
||||||
|
};
|
||||||
|
let algorithm = if let Some(algorithm) = algorithm {
|
||||||
|
algorithm
|
||||||
|
} else {
|
||||||
|
PyKMeansAlgorithm::Lloyd()
|
||||||
|
};
|
||||||
|
let config = KMeansConfig::default();
|
||||||
|
match algorithm {
|
||||||
|
PyKMeansAlgorithm::Lloyd() => PyKMeans {
|
||||||
|
ndim: shape[1],
|
||||||
|
inner: match init {
|
||||||
|
PyKMeansInit::PlusPlus() => {
|
||||||
|
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_kmeanplusplus, &config)
|
||||||
|
}
|
||||||
|
PyKMeansInit::RandomPartition() => {
|
||||||
|
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_partition, &config)
|
||||||
|
}
|
||||||
|
PyKMeansInit::RandomSample() => {
|
||||||
|
kmeans.kmeans_lloyd(k, max_iter, KMeans::init_random_sample, &config)
|
||||||
|
}
|
||||||
|
PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_lloyd(
|
||||||
|
k,
|
||||||
|
max_iter,
|
||||||
|
KMeans::init_precomputed(centroids),
|
||||||
|
&config,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
PyKMeansAlgorithm::MiniBatch(size) => PyKMeans {
|
||||||
|
ndim: shape[1],
|
||||||
|
inner: match init {
|
||||||
|
PyKMeansInit::PlusPlus() => kmeans.kmeans_minibatch(
|
||||||
|
size,
|
||||||
|
k,
|
||||||
|
max_iter,
|
||||||
|
KMeans::init_kmeanplusplus,
|
||||||
|
&config,
|
||||||
|
),
|
||||||
|
PyKMeansInit::RandomPartition() => kmeans.kmeans_minibatch(
|
||||||
|
size,
|
||||||
|
k,
|
||||||
|
max_iter,
|
||||||
|
KMeans::init_random_partition,
|
||||||
|
&config,
|
||||||
|
),
|
||||||
|
PyKMeansInit::RandomSample() => kmeans.kmeans_minibatch(
|
||||||
|
size,
|
||||||
|
k,
|
||||||
|
max_iter,
|
||||||
|
KMeans::init_random_sample,
|
||||||
|
&config,
|
||||||
|
),
|
||||||
|
PyKMeansInit::Precomputed(centroids) => kmeans.kmeans_minibatch(
|
||||||
|
size,
|
||||||
|
k,
|
||||||
|
max_iter,
|
||||||
|
KMeans::init_precomputed(centroids),
|
||||||
|
&config,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// K-Means++ initialization method, as implemented in Matlab
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method starts by selecting one sample as first centroid.
|
||||||
|
/// Proceeding from there, the method iteratively selects one new centroid (per iteration) by calculating
|
||||||
|
/// each sample's probability of "being a centroid". This probability is bigger, the farther away a sample
|
||||||
|
/// is from its centroid. Then, one sample is randomly selected, while taking their probability of being
|
||||||
|
/// the next centroid into account. This leads to a tendency of selecting centroids, that are far away from
|
||||||
|
/// their currently assigned cluster's centroid.
|
||||||
|
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn init_plusplus() -> PyKMeansInit {
|
||||||
|
PyKMeansInit::PlusPlus()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Random-Parition initialization method
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method randomly partitions the samples into k partitions, and then calculates these partion's means.
|
||||||
|
/// These means are then used as initial clusters.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn init_random_partition() -> PyKMeansInit {
|
||||||
|
PyKMeansInit::RandomPartition()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Random sample initialization method (a.k.a. Forgy)
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method randomly selects k centroids from the samples as initial centroids.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn init_random_sample() -> PyKMeansInit {
|
||||||
|
PyKMeansInit::RandomSample()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Precomputed centroids initialization method
|
||||||
|
///
|
||||||
|
/// ## Description
|
||||||
|
/// This initialization method requires a precomputed list of k centroids to use as initial
|
||||||
|
/// centroids.
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn init_precomputed(
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
centroids: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
) -> PyKMeansInit {
|
||||||
|
PyKMeansInit::Precomputed(centroids.as_array().flatten().to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normal K-Means algorithm implementation. This is the same algorithm as implemented in Matlab (one-phase).
|
||||||
|
/// (see: https://uk.mathworks.com/help/stats/kmeans.html#bueq7aj-5 Section: More About)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn algo_lloyd() -> PyKMeansAlgorithm {
|
||||||
|
PyKMeansAlgorithm::Lloyd()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mini-Batch k-Means implementation.
|
||||||
|
/// (see: https://dl.acm.org/citation.cfm?id=1772862)
|
||||||
|
///
|
||||||
|
/// ## Arguments
|
||||||
|
/// - **batch_size**: Amount of samples to use per iteration (higher -> better approximation but slower)
|
||||||
|
#[staticmethod]
|
||||||
|
pub(crate) fn algo_mini_batch(batch_size: usize) -> PyKMeansAlgorithm {
|
||||||
|
PyKMeansAlgorithm::MiniBatch(batch_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// find the closest cluster and the distance for each point
|
||||||
|
pub(crate) fn predict(
|
||||||
|
&self,
|
||||||
|
py: Python,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
points: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
) -> PyResult<(Vec<usize>, Vec<f64>)> {
|
||||||
|
let points = points.as_array();
|
||||||
|
Ok(py.detach(|| self.inner.predict(points))?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// calculate the mean simple (using centroids) silhouette score for a set of points,
|
||||||
|
/// assignments must be specified if they do not correspond to the assignments in the KMeans instance
|
||||||
|
#[pyo3(signature = (points, assignments = None))]
|
||||||
|
pub(crate) fn silhouette_simple(
|
||||||
|
&self,
|
||||||
|
py: Python,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
points: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
assignments: Option<PyArrayLike1<usize, AllowTypeChange>>,
|
||||||
|
) -> PyResult<f64> {
|
||||||
|
let points = points.as_array();
|
||||||
|
let assignments = assignments.as_ref().map(|a| a.as_array());
|
||||||
|
Ok(py.detach(|| self.inner.silhouette_simple(points, assignments))?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// number of dimensions
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn ndim(&self) -> usize {
|
||||||
|
self.ndim
|
||||||
|
}
|
||||||
|
|
||||||
|
/// number of clusters
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn k(&self) -> usize {
|
||||||
|
self.inner.k
|
||||||
|
}
|
||||||
|
|
||||||
|
/// sum of all distances, cost measure
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn distance_sum(&self) -> f64 {
|
||||||
|
self.inner.distsum
|
||||||
|
}
|
||||||
|
|
||||||
|
/// centroid coordinates
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn centroids<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyArray2<f64>>> {
|
||||||
|
let v = self.inner.centroids.to_vec();
|
||||||
|
Ok(Array2::from_shape_vec((v.len() / self.ndim, self.ndim), v)
|
||||||
|
.map_err(|e| PyErr::new::<PyTypeError, String>(e.to_string()))?
|
||||||
|
.into_pyarray(py))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// centroid frequencies
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn centroid_frequency(&self) -> Vec<usize> {
|
||||||
|
self.inner.centroid_frequency.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// to which cluster each of the points is assigned
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn assignments(&self) -> Vec<usize> {
|
||||||
|
self.inner.assignments.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// distances of all points to the center it's assigned to
|
||||||
|
#[getter]
|
||||||
|
pub(crate) fn centroid_distances(&self) -> Vec<f64> {
|
||||||
|
self.inner.centroid_distances.clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Predict<f64> for KMeansState<f64> {
|
||||||
|
type Error = Error;
|
||||||
|
|
||||||
|
fn predict<'a, A>(&self, points: A) -> Result<(Vec<usize>, Vec<f64>), Self::Error>
|
||||||
|
where
|
||||||
|
A: AsArray<'a, f64, Ix2>,
|
||||||
|
{
|
||||||
|
let centroids = self.centroids.to_vec();
|
||||||
|
let ndim = centroids.len() / self.k;
|
||||||
|
let points = points.into();
|
||||||
|
let shape = points.shape();
|
||||||
|
if shape[1] != ndim {
|
||||||
|
return Err(Error::ShapeMismatch(shape[1], ndim));
|
||||||
|
}
|
||||||
|
if centroids.is_empty() {
|
||||||
|
return Err(Error::NoCentroidsDefined);
|
||||||
|
}
|
||||||
|
let fill = vec![0.0; 8 - ndim % 8];
|
||||||
|
let e = EuclideanDistance;
|
||||||
|
let dist = |s: &[f64]| {
|
||||||
|
s.par_chunks_exact(ndim)
|
||||||
|
.map(|point| {
|
||||||
|
let (i, d) = centroids
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.enumerate()
|
||||||
|
.fold(
|
||||||
|
|| (usize::MAX, f64::INFINITY),
|
||||||
|
|(i, a), (j, centroid)| {
|
||||||
|
let b = <EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||||
|
&e,
|
||||||
|
&[point, &fill].concat(),
|
||||||
|
&[centroid, &fill].concat(),
|
||||||
|
);
|
||||||
|
if a <= b { (i, a) } else { (j, b) }
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.reduce(
|
||||||
|
|| (usize::MAX, f64::INFINITY),
|
||||||
|
|(i, a), (j, b)| {
|
||||||
|
if a <= b { (i, a) } else { (j, b) }
|
||||||
|
},
|
||||||
|
);
|
||||||
|
(i, d.sqrt())
|
||||||
|
})
|
||||||
|
.collect::<(Vec<_>, Vec<_>)>()
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(if let Some(s) = points.as_slice() {
|
||||||
|
dist(s)
|
||||||
|
} else {
|
||||||
|
let s = points.flatten().to_vec();
|
||||||
|
dist(&s)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn silhouette_simple<'p, 'a, P, A>(
|
||||||
|
&self,
|
||||||
|
points: P,
|
||||||
|
assignments: Option<A>,
|
||||||
|
) -> Result<f64, Self::Error>
|
||||||
|
where
|
||||||
|
P: AsArray<'p, f64, Ix2>,
|
||||||
|
A: AsArray<'a, usize, Ix1>,
|
||||||
|
{
|
||||||
|
let points = points.into();
|
||||||
|
let shape = points.shape();
|
||||||
|
let centroids = Arc::new(self.centroids.to_vec());
|
||||||
|
let ndim = centroids.len() / self.k;
|
||||||
|
|
||||||
|
if shape[1] != ndim {
|
||||||
|
return Err(Error::ShapeMismatch(shape[1], ndim));
|
||||||
|
}
|
||||||
|
if centroids.is_empty() {
|
||||||
|
return Err(Error::NoCentroidsDefined);
|
||||||
|
}
|
||||||
|
|
||||||
|
let assignments = if let Some(assignments) = assignments {
|
||||||
|
assignments.into().to_vec()
|
||||||
|
} else {
|
||||||
|
self.assignments.to_vec()
|
||||||
|
};
|
||||||
|
let k = self.k;
|
||||||
|
let mut clusters = vec![Vec::new(); k];
|
||||||
|
for (point, assignment) in points.rows().into_iter().zip(assignments) {
|
||||||
|
clusters[assignment].extend(point.to_vec());
|
||||||
|
}
|
||||||
|
let fill = vec![0.0; 8 - ndim % 8];
|
||||||
|
let a = clusters
|
||||||
|
.par_iter()
|
||||||
|
.zip(centroids.clone().par_chunks_exact(ndim))
|
||||||
|
.flat_map(|(points, centroid)| {
|
||||||
|
let c = [centroid, &fill].concat();
|
||||||
|
let fill = fill.clone();
|
||||||
|
let e = EuclideanDistance;
|
||||||
|
points.par_chunks_exact(ndim).map(move |point| {
|
||||||
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||||
|
&e,
|
||||||
|
&c,
|
||||||
|
&[point, &fill].concat(),
|
||||||
|
)
|
||||||
|
.sqrt()
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let b = clusters
|
||||||
|
.par_iter()
|
||||||
|
.enumerate()
|
||||||
|
.flat_map(|(i, points)| {
|
||||||
|
let centroids = centroids.clone();
|
||||||
|
let fill = fill.clone();
|
||||||
|
let e = EuclideanDistance;
|
||||||
|
points.par_chunks_exact(ndim).map(move |point| {
|
||||||
|
centroids
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.enumerate()
|
||||||
|
.map(|(j, centroid)| {
|
||||||
|
if i == j {
|
||||||
|
f64::INFINITY
|
||||||
|
} else {
|
||||||
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||||
|
&e,
|
||||||
|
&[centroid, &fill].concat(),
|
||||||
|
&[point, &fill].concat(),
|
||||||
|
)
|
||||||
|
.sqrt()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.min_by(|a, b| a.total_cmp(b))
|
||||||
|
.unwrap_or(f64::INFINITY)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
Ok(a.into_iter()
|
||||||
|
.zip(b)
|
||||||
|
.map(|(a, b)| (b - a) / a.max(b))
|
||||||
|
.sum::<f64>()
|
||||||
|
/ points.shape()[0] as f64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn silhouette<'p, 'a, P, A, K>(points: P, assignments: A) -> Result<f64, Error>
|
||||||
|
where
|
||||||
|
P: AsArray<'p, f64, Ix2>,
|
||||||
|
A: AsArray<'a, K, Ix1>,
|
||||||
|
K: 'a + Eq + Hash,
|
||||||
|
{
|
||||||
|
let points = points.into();
|
||||||
|
let assignments = assignments.into();
|
||||||
|
let shape = points.shape();
|
||||||
|
let n = shape[0];
|
||||||
|
let ndim = shape[1];
|
||||||
|
|
||||||
|
let labels = assignments
|
||||||
|
.iter()
|
||||||
|
.collect::<HashSet<_>>()
|
||||||
|
.into_iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(k, v)| (v, k))
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
let assignments = assignments.iter().map(|k| labels[k]).collect::<Vec<_>>();
|
||||||
|
let k = labels.len();
|
||||||
|
|
||||||
|
let mut clusters = vec![Vec::new(); k];
|
||||||
|
for (point, assignment) in points.rows().into_iter().zip(assignments) {
|
||||||
|
clusters[assignment].extend(point.to_vec());
|
||||||
|
}
|
||||||
|
let bar = get_bar(Some(k * n + k * n * k))?;
|
||||||
|
let fill = vec![0.0; 8 - ndim % 8];
|
||||||
|
let e = EuclideanDistance;
|
||||||
|
let a = clusters
|
||||||
|
.par_iter()
|
||||||
|
.flat_map(|points| {
|
||||||
|
let c = (points.len() / ndim - 1) as f64;
|
||||||
|
points
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.map(|i| {
|
||||||
|
let q = points
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.map(|j| {
|
||||||
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||||
|
&e,
|
||||||
|
&[i, &fill].concat(),
|
||||||
|
&[j, &fill].concat(),
|
||||||
|
)
|
||||||
|
.sqrt()
|
||||||
|
})
|
||||||
|
.sum::<f64>()
|
||||||
|
/ c;
|
||||||
|
bar.inc(1);
|
||||||
|
q
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let b = clusters
|
||||||
|
.par_iter()
|
||||||
|
.enumerate()
|
||||||
|
.flat_map(|(i, points_i)| {
|
||||||
|
points_i
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.map(|a| {
|
||||||
|
clusters
|
||||||
|
.par_iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(j, points_j)| {
|
||||||
|
let c = (points_j.len() / ndim) as f64;
|
||||||
|
let q = if i == j {
|
||||||
|
f64::INFINITY
|
||||||
|
} else {
|
||||||
|
points_j
|
||||||
|
.par_chunks_exact(ndim)
|
||||||
|
.map(|b| {
|
||||||
|
<EuclideanDistance as DistanceFunction<f64, 8>>::distance(
|
||||||
|
&e,
|
||||||
|
&[a, &fill].concat(),
|
||||||
|
&[b, &fill].concat(),
|
||||||
|
)
|
||||||
|
.sqrt()
|
||||||
|
})
|
||||||
|
.sum::<f64>()
|
||||||
|
/ c
|
||||||
|
};
|
||||||
|
bar.inc(1);
|
||||||
|
q
|
||||||
|
})
|
||||||
|
.min_by(|a, b| a.total_cmp(b))
|
||||||
|
.unwrap_or(f64::INFINITY)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
bar.finish();
|
||||||
|
Ok(a.into_iter()
|
||||||
|
.zip(b)
|
||||||
|
.map(|(a, b)| (b - a) / a.max(b))
|
||||||
|
.sum::<f64>()
|
||||||
|
/ points.shape()[0] as f64)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// calculate the mean silhouette score for a set of points
|
||||||
|
#[gen_stub_pyfunction(module = "kmeans_rs")]
|
||||||
|
#[pyfunction(name = "silhouette")]
|
||||||
|
pub(crate) fn py_silhouette(
|
||||||
|
py: Python,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
points: PyArrayLike2<f64, AllowTypeChange>,
|
||||||
|
#[gen_stub(override_type(type_repr="numpy.typing.ArrayLike", imports=("numpy", "numpy.typing")))]
|
||||||
|
assignments: PyArrayLike1<usize, AllowTypeChange>,
|
||||||
|
) -> PyResult<f64> {
|
||||||
|
let points = points.as_array();
|
||||||
|
let assignments = assignments.as_array();
|
||||||
|
Ok(py.detach(|| silhouette(points, assignments))?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// generates kmeans/__init__.pyi
|
||||||
|
#[pyfunction]
|
||||||
|
fn generate_stub(dest_path: String) -> PyResult<()> {
|
||||||
|
Ok(StubInfo::from_project_root(
|
||||||
|
"kmeans_rs".to_string(),
|
||||||
|
PathBuf::from(dest_path).join("py"),
|
||||||
|
true,
|
||||||
|
StubGenConfig::default(),
|
||||||
|
)?
|
||||||
|
.generate()?)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymodule]
|
||||||
|
#[pyo3(name = "kmeans_rs")]
|
||||||
|
mod kmeans_rs {
|
||||||
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
|
#[pymodule_export]
|
||||||
|
use super::generate_stub;
|
||||||
|
|
||||||
|
#[pymodule_export]
|
||||||
|
use super::PyKMeans;
|
||||||
|
|
||||||
|
#[pymodule_export]
|
||||||
|
use super::PyKMeansInit;
|
||||||
|
|
||||||
|
#[pymodule_export]
|
||||||
|
use super::PyKMeansAlgorithm;
|
||||||
|
|
||||||
|
#[pymodule_export]
|
||||||
|
use super::py_silhouette;
|
||||||
|
|
||||||
|
#[pymodule_init]
|
||||||
|
fn init(_: &Bound<'_, PyModule>) -> PyResult<()> {
|
||||||
|
Ok(color_eyre::install()?)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user