From 7907db7627aafa35a45f7246e79f5369b6714828 Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Tue, 5 Feb 2019 03:51:13 -0500 Subject: populate README, clean up comments, remove unused includes --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/km-find.c | 3 --- src/km-rand-erand48.c | 6 +++--- src/km-rand-path.c | 3 ++- src/km-solve.c | 1 - 5 files changed, 58 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3667e8c..e43568f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,15 @@ -KMeans Classifier Implementation +K-Means Clustering +================== -Examples: +[K-Means][kmeans] clustering implementation. Features: +* Initialization methods: random, [forgy][forgy], and [k-means++][kmeanspp]. +* Uses [Silhouette method][silhouette] to determine the optimum cluster + count. +* No external dependencies. + +Examples +-------- Generate 1000 rows of test data, clustered around 3 points: # generate test data @@ -22,3 +30,46 @@ PNGs of results: echo $i ./km-test kmeans $i $png_path > $dst_path done + +Initialization Methods +---------------------- +Supported initialization methods: + +* `rand`: Pick random points as the initial cluster centroids. +* [forgy][forgy]: Pick random points from the data set as the initial + cluster centroids. +* [kmeans][kmeanspp]: Use the [k-means++][kmeanspp] initialization method + to pick the initial cluster centroids. This is the recommended + initialization method. + +Data File Format +---------------- +Reads and writes newline-delimited plain text files in the following +format: + +* Lines are delimited by newlines +* Each line is a record. +* Record fields are delimited by whitespace. +* The first row specifies the *shape* of the remaining rows as two + unsigned integers. The first unsigned integer -- `num_floats` -- + indicates the number of floating point columns per row, and the second + unsigned integer -- `num_ints` -- indicates the number of signed + integer values per row. +* The remaining lines contain data rows. Each row consists of + `num_floats` floating point numbers, followed by `num_ints` signed + integer values. + +Example data file: + + 3 0 + 1.2 3.6 5.2 + 9.8 6.5 4.3 + 3.2 5.6 8.7 + +See the files in `tests/` for additional examples. You can also use +the top-level `gen-data.rb` script to generate additional test data. + + [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering "K-Means clustering" + [kmeanspp]: https://en.wikipedia.org/wiki/K-means%2B%2B "k-means++ initialization method" + [forgy]: https://en.wikipedia.org/wiki/K-means_clustering#Initialization_methods "Forgy initialization method" + [silhouette]: https://en.wikipedia.org/wiki/Silhouette_%28clustering%29 "Silhouette method" diff --git a/src/km-find.c b/src/km-find.c index 657c645..5ae5948 100644 --- a/src/km-find.c +++ b/src/km-find.c @@ -1,10 +1,7 @@ #include // bool -#include // fabsf() #include "util.h" #include "km.h" -#define MIN_CLUSTER_DISTANCE 0.0001 - static float get_mean_cluster_size( const km_set_t * const set diff --git a/src/km-rand-erand48.c b/src/km-rand-erand48.c index de84b93..3d7b02d 100644 --- a/src/km-rand-erand48.c +++ b/src/km-rand-erand48.c @@ -1,7 +1,6 @@ #include // bool #define _DEFAULT_SOURCE -#include // drand48_r() -#include // fopen() +#include // erand48() #include "util.h" #include "km.h" @@ -62,7 +61,8 @@ ERAND48_RAND_CBS = { .fini = on_fini, }; -// init system random source (uses system rand()) +// init erand48 random source (uses erand48(), seeded with the lower +// 48-bits of the seed parameter) bool km_rand_init_erand48( km_rand_t * const rs, diff --git a/src/km-rand-path.c b/src/km-rand-path.c index 6597038..c1d011d 100644 --- a/src/km-rand-path.c +++ b/src/km-rand-path.c @@ -114,7 +114,8 @@ PATH_RAND_CBS = { .fini = on_fini, }; -// init system random source (uses system rand()) +// init random source from a file (e.g. "/dev/urandom") +// (NOTE: this method is horribly slow, don't use it) bool km_rand_init_path( km_rand_t * const rs, diff --git a/src/km-solve.c b/src/km-solve.c index 4730eb4..b6d98d6 100644 --- a/src/km-solve.c +++ b/src/km-solve.c @@ -1,7 +1,6 @@ #include // bool #include // memset() #include // FLT_MAX -#include // sqrt() #include "util.h" #include "km.h" -- cgit v1.2.3