diff options
-rw-r--r-- | README.md | 55 | ||||
-rw-r--r-- | src/km-find.c | 3 | ||||
-rw-r--r-- | src/km-rand-erand48.c | 6 | ||||
-rw-r--r-- | src/km-rand-path.c | 3 | ||||
-rw-r--r-- | src/km-solve.c | 1 |
5 files changed, 58 insertions, 10 deletions
@@ -1,7 +1,15 @@ -KMeans Classifier Implementation +K-Means Clustering +================== -Examples: +[K-Means][kmeans] clustering implementation. Features: +* Initialization methods: random, [forgy][forgy], and [k-means++][kmeanspp]. +* Uses [Silhouette method][silhouette] to determine the optimum cluster + count. +* No external dependencies. + +Examples +-------- Generate 1000 rows of test data, clustered around 3 points: # generate test data @@ -22,3 +30,46 @@ PNGs of results: echo $i ./km-test kmeans $i $png_path > $dst_path done + +Initialization Methods +---------------------- +Supported initialization methods: + +* `rand`: Pick random points as the initial cluster centroids. +* [forgy][forgy]: Pick random points from the data set as the initial + cluster centroids. +* [kmeans][kmeanspp]: Use the [k-means++][kmeanspp] initialization method + to pick the initial cluster centroids. This is the recommended + initialization method. + +Data File Format +---------------- +Reads and writes newline-delimited plain text files in the following +format: + +* Lines are delimited by newlines +* Each line is a record. +* Record fields are delimited by whitespace. +* The first row specifies the *shape* of the remaining rows as two + unsigned integers. The first unsigned integer -- `num_floats` -- + indicates the number of floating point columns per row, and the second + unsigned integer -- `num_ints` -- indicates the number of signed + integer values per row. +* The remaining lines contain data rows. Each row consists of + `num_floats` floating point numbers, followed by `num_ints` signed + integer values. + +Example data file: + + 3 0 + 1.2 3.6 5.2 + 9.8 6.5 4.3 + 3.2 5.6 8.7 + +See the files in `tests/` for additional examples. You can also use +the top-level `gen-data.rb` script to generate additional test data. + + [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering "K-Means clustering" + [kmeanspp]: https://en.wikipedia.org/wiki/K-means%2B%2B "k-means++ initialization method" + [forgy]: https://en.wikipedia.org/wiki/K-means_clustering#Initialization_methods "Forgy initialization method" + [silhouette]: https://en.wikipedia.org/wiki/Silhouette_%28clustering%29 "Silhouette method" diff --git a/src/km-find.c b/src/km-find.c index 657c645..5ae5948 100644 --- a/src/km-find.c +++ b/src/km-find.c @@ -1,10 +1,7 @@ #include <stdbool.h> // bool -#include <math.h> // fabsf() #include "util.h" #include "km.h" -#define MIN_CLUSTER_DISTANCE 0.0001 - static float get_mean_cluster_size( const km_set_t * const set diff --git a/src/km-rand-erand48.c b/src/km-rand-erand48.c index de84b93..3d7b02d 100644 --- a/src/km-rand-erand48.c +++ b/src/km-rand-erand48.c @@ -1,7 +1,6 @@ #include <stdbool.h> // bool #define _DEFAULT_SOURCE -#include <stdlib.h> // drand48_r() -#include <stdio.h> // fopen() +#include <stdlib.h> // erand48() #include "util.h" #include "km.h" @@ -62,7 +61,8 @@ ERAND48_RAND_CBS = { .fini = on_fini, }; -// init system random source (uses system rand()) +// init erand48 random source (uses erand48(), seeded with the lower +// 48-bits of the seed parameter) bool km_rand_init_erand48( km_rand_t * const rs, diff --git a/src/km-rand-path.c b/src/km-rand-path.c index 6597038..c1d011d 100644 --- a/src/km-rand-path.c +++ b/src/km-rand-path.c @@ -114,7 +114,8 @@ PATH_RAND_CBS = { .fini = on_fini, }; -// init system random source (uses system rand()) +// init random source from a file (e.g. "/dev/urandom") +// (NOTE: this method is horribly slow, don't use it) bool km_rand_init_path( km_rand_t * const rs, diff --git a/src/km-solve.c b/src/km-solve.c index 4730eb4..b6d98d6 100644 --- a/src/km-solve.c +++ b/src/km-solve.c @@ -1,7 +1,6 @@ #include <stdbool.h> // bool #include <string.h> // memset() #include <float.h> // FLT_MAX -#include <math.h> // sqrt() #include "util.h" #include "km.h" |