diff options
author | Paul Duncan <pabs@pablotron.org> | 2019-02-04 09:36:52 -0500 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2019-02-04 09:36:52 -0500 |
commit | f496f3f1ce5bd068930382f7516494abdbf62489 (patch) | |
tree | f912fff2b50e04cfc4585fa6d8f6afcb05460a68 | |
parent | 4a4373f113b607f715badd2ea06cba461fe3bbcf (diff) | |
download | kmeans-f496f3f1ce5bd068930382f7516494abdbf62489.tar.bz2 kmeans-f496f3f1ce5bd068930382f7516494abdbf62489.zip |
add kmeans++ method
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | km-init-kmeans.c | 94 | ||||
-rw-r--r-- | km-init.c | 5 | ||||
-rw-r--r-- | km-solve.c | 17 | ||||
-rw-r--r-- | km.h | 7 | ||||
-rw-r--r-- | util.h | 17 |
6 files changed, 122 insertions, 20 deletions
@@ -1,7 +1,7 @@ APP=km-test CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2 OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o \ - km-init-rand.o km-init-forgy.o km-init.o main.o + km-init-rand.o km-init-forgy.o km-init-kmeans.o km-init.o main.o LIBS=-lm .PHONY=all clean diff --git a/km-init-kmeans.c b/km-init-kmeans.c new file mode 100644 index 0000000..e49f879 --- /dev/null +++ b/km-init-kmeans.c @@ -0,0 +1,94 @@ +#include <stdbool.h> // bool +#include <string.h> // memset() +#include <float.h> // FLT_MAX +#include "util.h" +#include "km.h" + +static const float * +get_random_row( + const km_set_t * const set, + km_rand_t * const rs +) { + // get random offset + size_t ofs = 0; + if (!km_rand_fill_sizes(rs, 1, &ofs)) { + die("km_rand_fill_sizes()"); + } + + return km_set_get_row(set, ofs % set->num_rows); +} + +// init a cluster set of num_clusters by picking the first cluster from +// the set of points at random +// random initial points from the set +bool +km_init_kmeans( + km_set_t * const cs, + const size_t num_clusters, + const km_set_t * const set, + km_rand_t * const rs +) { + const size_t num_floats = set->shape.num_floats, + stride = sizeof(float) * num_floats; + + // row values (filled below) + float floats[num_floats * num_clusters]; + + // pick first row randomly + { + // get random row + const float * const vals = get_random_row(set, rs); + + // copy row values to floats buffer + memcpy(floats, vals, stride); + } + + for (size_t i = 1; i < num_clusters;) { + // get random row + const float * const vals = get_random_row(set, rs); + + // calculate squared distance to nearest cluster + float min_d2 = FLT_MAX; + for (size_t j = 0; j < i; j++) { + const float d2 = distance_squared(num_floats, vals, floats + j * stride); + if (d2 < min_d2) { + min_d2 = d2; + } + } + + // get a random floating point value + float rand_val = 0; + if (!km_rand_fill(rs, 1, &rand_val)) { + // return failure + return false; + } + + // check random value + if (rand_val < min_d2) { + // copy row values + memcpy(floats + i * num_floats, vals, stride); + + // increment cluster count + i++; + } + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push(cs, num_clusters, floats, ints); +} @@ -13,6 +13,7 @@ typedef bool (*km_init_fn_t)( bool km_init_rand(km_set_t *, const size_t, const km_set_t *, km_rand_t *); bool km_init_forgy(km_set_t *, const size_t, const km_set_t *, km_rand_t *); +bool km_init_kmeans(km_set_t *, const size_t, const km_set_t *, km_rand_t *); static const struct { const km_init_type_t type; @@ -26,6 +27,10 @@ static const struct { .name = "forgy", .type = KM_INIT_TYPE_FORGY, .init = km_init_forgy, +}, { + .name = "kmeans", + .type = KM_INIT_TYPE_KMEANS, + .init = km_init_kmeans, }}; #define NUM_TYPES (sizeof(TYPES) / sizeof(TYPES[0])) @@ -5,23 +5,6 @@ #include "util.h" #include "km.h" -// calculate squared euclidean distance between two points -static float -distance_squared( - const size_t num_floats, - const float * const a, - const float * const b -) { - float r = 0.0; - - for (size_t i = 0; i < num_floats; i++) { - r += (b[i] - a[i]) * (b[i] - a[i]); - } - - // return squared distance - return r; -} - // alloc and initialize row map static km_row_map_t * km_row_map_init( @@ -189,12 +189,15 @@ _Bool km_load_path( ); typedef enum { - // init cluster set by picking random centroids + // rand: init cluster set by picking random centroids KM_INIT_TYPE_RAND, - // init cluster set by picking random rows from set + // forgy: init cluster set by picking random rows from set KM_INIT_TYPE_FORGY, + // kmeans++: ... + KM_INIT_TYPE_KMEANS, + KM_INIT_TYPE_LAST, } km_init_type_t; @@ -21,6 +21,23 @@ exit(EXIT_FAILURE); \ } while (0) +// calculate squared euclidean distance between two points +static inline float +distance_squared( + const size_t num_floats, + const float * const a, + const float * const b +) { + float r = 0.0; + + for (size_t i = 0; i < num_floats; i++) { + r += (b[i] - a[i]) * (b[i] - a[i]); + } + + // return squared distance + return r; +} + static inline float km_score( const float mean_distance, |