aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2019-02-04 09:36:52 -0500
committerPaul Duncan <pabs@pablotron.org>2019-02-04 09:36:52 -0500
commitf496f3f1ce5bd068930382f7516494abdbf62489 (patch)
treef912fff2b50e04cfc4585fa6d8f6afcb05460a68
parent4a4373f113b607f715badd2ea06cba461fe3bbcf (diff)
downloadkmeans-f496f3f1ce5bd068930382f7516494abdbf62489.tar.bz2
kmeans-f496f3f1ce5bd068930382f7516494abdbf62489.zip
add kmeans++ method
-rw-r--r--Makefile2
-rw-r--r--km-init-kmeans.c94
-rw-r--r--km-init.c5
-rw-r--r--km-solve.c17
-rw-r--r--km.h7
-rw-r--r--util.h17
6 files changed, 122 insertions, 20 deletions
diff --git a/Makefile b/Makefile
index cfb7ae2..99aebc9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
APP=km-test
CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2
OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o \
- km-init-rand.o km-init-forgy.o km-init.o main.o
+ km-init-rand.o km-init-forgy.o km-init-kmeans.o km-init.o main.o
LIBS=-lm
.PHONY=all clean
diff --git a/km-init-kmeans.c b/km-init-kmeans.c
new file mode 100644
index 0000000..e49f879
--- /dev/null
+++ b/km-init-kmeans.c
@@ -0,0 +1,94 @@
+#include <stdbool.h> // bool
+#include <string.h> // memset()
+#include <float.h> // FLT_MAX
+#include "util.h"
+#include "km.h"
+
+static const float *
+get_random_row(
+ const km_set_t * const set,
+ km_rand_t * const rs
+) {
+ // get random offset
+ size_t ofs = 0;
+ if (!km_rand_fill_sizes(rs, 1, &ofs)) {
+ die("km_rand_fill_sizes()");
+ }
+
+ return km_set_get_row(set, ofs % set->num_rows);
+}
+
+// init a cluster set of num_clusters by picking the first cluster from
+// the set of points at random
+// random initial points from the set
+bool
+km_init_kmeans(
+ km_set_t * const cs,
+ const size_t num_clusters,
+ const km_set_t * const set,
+ km_rand_t * const rs
+) {
+ const size_t num_floats = set->shape.num_floats,
+ stride = sizeof(float) * num_floats;
+
+ // row values (filled below)
+ float floats[num_floats * num_clusters];
+
+ // pick first row randomly
+ {
+ // get random row
+ const float * const vals = get_random_row(set, rs);
+
+ // copy row values to floats buffer
+ memcpy(floats, vals, stride);
+ }
+
+ for (size_t i = 1; i < num_clusters;) {
+ // get random row
+ const float * const vals = get_random_row(set, rs);
+
+ // calculate squared distance to nearest cluster
+ float min_d2 = FLT_MAX;
+ for (size_t j = 0; j < i; j++) {
+ const float d2 = distance_squared(num_floats, vals, floats + j * stride);
+ if (d2 < min_d2) {
+ min_d2 = d2;
+ }
+ }
+
+ // get a random floating point value
+ float rand_val = 0;
+ if (!km_rand_fill(rs, 1, &rand_val)) {
+ // return failure
+ return false;
+ }
+
+ // check random value
+ if (rand_val < min_d2) {
+ // copy row values
+ memcpy(floats + i * num_floats, vals, stride);
+
+ // increment cluster count
+ i++;
+ }
+ }
+
+ // FIXME: should probably be heap-allocated
+ int ints[num_clusters];
+ memset(ints, 0, sizeof(ints));
+
+ // init cluster shape
+ const km_shape_t shape = {
+ .num_floats = num_floats,
+ .num_ints = 1,
+ };
+
+ // init cluster set
+ if (!km_set_init(cs, &shape, num_clusters)) {
+ // return failure
+ return false;
+ }
+
+ // add data, return result
+ return km_set_push(cs, num_clusters, floats, ints);
+}
diff --git a/km-init.c b/km-init.c
index f04fca0..280ea81 100644
--- a/km-init.c
+++ b/km-init.c
@@ -13,6 +13,7 @@ typedef bool (*km_init_fn_t)(
bool km_init_rand(km_set_t *, const size_t, const km_set_t *, km_rand_t *);
bool km_init_forgy(km_set_t *, const size_t, const km_set_t *, km_rand_t *);
+bool km_init_kmeans(km_set_t *, const size_t, const km_set_t *, km_rand_t *);
static const struct {
const km_init_type_t type;
@@ -26,6 +27,10 @@ static const struct {
.name = "forgy",
.type = KM_INIT_TYPE_FORGY,
.init = km_init_forgy,
+}, {
+ .name = "kmeans",
+ .type = KM_INIT_TYPE_KMEANS,
+ .init = km_init_kmeans,
}};
#define NUM_TYPES (sizeof(TYPES) / sizeof(TYPES[0]))
diff --git a/km-solve.c b/km-solve.c
index 4560c67..9eac13c 100644
--- a/km-solve.c
+++ b/km-solve.c
@@ -5,23 +5,6 @@
#include "util.h"
#include "km.h"
-// calculate squared euclidean distance between two points
-static float
-distance_squared(
- const size_t num_floats,
- const float * const a,
- const float * const b
-) {
- float r = 0.0;
-
- for (size_t i = 0; i < num_floats; i++) {
- r += (b[i] - a[i]) * (b[i] - a[i]);
- }
-
- // return squared distance
- return r;
-}
-
// alloc and initialize row map
static km_row_map_t *
km_row_map_init(
diff --git a/km.h b/km.h
index 866e941..d8fe225 100644
--- a/km.h
+++ b/km.h
@@ -189,12 +189,15 @@ _Bool km_load_path(
);
typedef enum {
- // init cluster set by picking random centroids
+ // rand: init cluster set by picking random centroids
KM_INIT_TYPE_RAND,
- // init cluster set by picking random rows from set
+ // forgy: init cluster set by picking random rows from set
KM_INIT_TYPE_FORGY,
+ // kmeans++: ...
+ KM_INIT_TYPE_KMEANS,
+
KM_INIT_TYPE_LAST,
} km_init_type_t;
diff --git a/util.h b/util.h
index fa54ce0..821c3e6 100644
--- a/util.h
+++ b/util.h
@@ -21,6 +21,23 @@
exit(EXIT_FAILURE); \
} while (0)
+// calculate squared euclidean distance between two points
+static inline float
+distance_squared(
+ const size_t num_floats,
+ const float * const a,
+ const float * const b
+) {
+ float r = 0.0;
+
+ for (size_t i = 0; i < num_floats; i++) {
+ r += (b[i] - a[i]) * (b[i] - a[i]);
+ }
+
+ // return squared distance
+ return r;
+}
+
static inline float
km_score(
const float mean_distance,