diff options
Diffstat (limited to 'src/km-init-kmeans.c')
-rw-r--r-- | src/km-init-kmeans.c | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/src/km-init-kmeans.c b/src/km-init-kmeans.c new file mode 100644 index 0000000..f0c1b5f --- /dev/null +++ b/src/km-init-kmeans.c @@ -0,0 +1,124 @@ +#include <stdbool.h> // bool +#include <string.h> // memset() +#include <float.h> // FLT_MAX +#include "util.h" +#include "km.h" + +// sum the squared distance of every row in set from this point +static inline float +sum_distance_squared( + const float * const floats, + const km_set_t * const set +) { + float r = 0; + + // sum squared distances + for (size_t i = 0; i < set->num_rows; i++) { + const float * const vals = km_set_get_row(set, i); + r += distance_squared(set->shape.num_floats, floats, vals); + } + + // return result + return r; +} + +static const float * +get_random_row( + const km_set_t * const set, + km_rand_t * const rs +) { + // get random offset + size_t ofs = 0; + if (!km_rand_get_sizes(rs, 1, &ofs)) { + die("km_rand_fill_sizes()"); + } + + return km_set_get_row(set, ofs % set->num_rows); +} + +// init a cluster set of num_clusters by picking the first cluster from +// the set of points at random +// random initial points from the set +bool +km_init_kmeans( + km_set_t * const cs, + const size_t num_clusters, + const km_set_t * const set, + km_rand_t * const rs +) { + const size_t num_floats = set->shape.num_floats, + stride = sizeof(float) * num_floats; + + // row values (filled below) + float sums[num_clusters]; + float floats[num_floats * num_clusters]; + + // pick first row randomly + { + // get random row + const float * const vals = get_random_row(set, rs); + + // copy row values to floats buffer + memcpy(floats, vals, stride); + + // copy row values to floats buffer + sums[0] = sum_distance_squared(floats, set); + } + + for (size_t i = 1; i < num_clusters;) { + // get random row + const float * const vals = get_random_row(set, rs); + + // calculate squared distance to nearest cluster + size_t ofs = 0; + float min_d2 = FLT_MAX; + for (size_t j = 0; j < i; j++) { + const float d2 = distance_squared(num_floats, vals, floats + j * stride); + if (d2 < min_d2) { + ofs = j; + min_d2 = d2; + } + } + + // get a random floating point value + float rand_val = 0; + if (!km_rand_get_floats(rs, 1, &rand_val)) { + // return failure + return false; + } + + // check random value + if (rand_val / sums[ofs] < min_d2) { + // get destination + float * const dst = floats + i * num_floats; + + // copy row values + memcpy(dst, vals, stride); + + // calculate total cluster distance + sums[i] = sum_distance_squared(dst, set); + + // increment cluster count + i++; + } + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push(cs, num_clusters, floats, ints); +} |