aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md55
-rw-r--r--src/km-find.c3
-rw-r--r--src/km-rand-erand48.c6
-rw-r--r--src/km-rand-path.c3
-rw-r--r--src/km-solve.c1
5 files changed, 58 insertions, 10 deletions
diff --git a/README.md b/README.md
index 3667e8c..e43568f 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,15 @@
-KMeans Classifier Implementation
+K-Means Clustering
+==================
-Examples:
+[K-Means][kmeans] clustering implementation. Features:
+* Initialization methods: random, [forgy][forgy], and [k-means++][kmeanspp].
+* Uses [Silhouette method][silhouette] to determine the optimum cluster
+ count.
+* No external dependencies.
+
+Examples
+--------
Generate 1000 rows of test data, clustered around 3 points:
# generate test data
@@ -22,3 +30,46 @@ PNGs of results:
echo $i
./km-test kmeans $i $png_path > $dst_path
done
+
+Initialization Methods
+----------------------
+Supported initialization methods:
+
+* `rand`: Pick random points as the initial cluster centroids.
+* [forgy][forgy]: Pick random points from the data set as the initial
+ cluster centroids.
+* [kmeans][kmeanspp]: Use the [k-means++][kmeanspp] initialization method
+ to pick the initial cluster centroids. This is the recommended
+ initialization method.
+
+Data File Format
+----------------
+Reads and writes newline-delimited plain text files in the following
+format:
+
+* Lines are delimited by newlines
+* Each line is a record.
+* Record fields are delimited by whitespace.
+* The first row specifies the *shape* of the remaining rows as two
+ unsigned integers. The first unsigned integer -- `num_floats` --
+ indicates the number of floating point columns per row, and the second
+ unsigned integer -- `num_ints` -- indicates the number of signed
+ integer values per row.
+* The remaining lines contain data rows. Each row consists of
+ `num_floats` floating point numbers, followed by `num_ints` signed
+ integer values.
+
+Example data file:
+
+ 3 0
+ 1.2 3.6 5.2
+ 9.8 6.5 4.3
+ 3.2 5.6 8.7
+
+See the files in `tests/` for additional examples. You can also use
+the top-level `gen-data.rb` script to generate additional test data.
+
+ [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering "K-Means clustering"
+ [kmeanspp]: https://en.wikipedia.org/wiki/K-means%2B%2B "k-means++ initialization method"
+ [forgy]: https://en.wikipedia.org/wiki/K-means_clustering#Initialization_methods "Forgy initialization method"
+ [silhouette]: https://en.wikipedia.org/wiki/Silhouette_%28clustering%29 "Silhouette method"
diff --git a/src/km-find.c b/src/km-find.c
index 657c645..5ae5948 100644
--- a/src/km-find.c
+++ b/src/km-find.c
@@ -1,10 +1,7 @@
#include <stdbool.h> // bool
-#include <math.h> // fabsf()
#include "util.h"
#include "km.h"
-#define MIN_CLUSTER_DISTANCE 0.0001
-
static float
get_mean_cluster_size(
const km_set_t * const set
diff --git a/src/km-rand-erand48.c b/src/km-rand-erand48.c
index de84b93..3d7b02d 100644
--- a/src/km-rand-erand48.c
+++ b/src/km-rand-erand48.c
@@ -1,7 +1,6 @@
#include <stdbool.h> // bool
#define _DEFAULT_SOURCE
-#include <stdlib.h> // drand48_r()
-#include <stdio.h> // fopen()
+#include <stdlib.h> // erand48()
#include "util.h"
#include "km.h"
@@ -62,7 +61,8 @@ ERAND48_RAND_CBS = {
.fini = on_fini,
};
-// init system random source (uses system rand())
+// init erand48 random source (uses erand48(), seeded with the lower
+// 48-bits of the seed parameter)
bool
km_rand_init_erand48(
km_rand_t * const rs,
diff --git a/src/km-rand-path.c b/src/km-rand-path.c
index 6597038..c1d011d 100644
--- a/src/km-rand-path.c
+++ b/src/km-rand-path.c
@@ -114,7 +114,8 @@ PATH_RAND_CBS = {
.fini = on_fini,
};
-// init system random source (uses system rand())
+// init random source from a file (e.g. "/dev/urandom")
+// (NOTE: this method is horribly slow, don't use it)
bool
km_rand_init_path(
km_rand_t * const rs,
diff --git a/src/km-solve.c b/src/km-solve.c
index 4730eb4..b6d98d6 100644
--- a/src/km-solve.c
+++ b/src/km-solve.c
@@ -1,7 +1,6 @@
#include <stdbool.h> // bool
#include <string.h> // memset()
#include <float.h> // FLT_MAX
-#include <math.h> // sqrt()
#include "util.h"
#include "km.h"