aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--Makefile10
-rw-r--r--km.c404
-rw-r--r--km.h82
4 files changed, 497 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5761abc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.o
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3873d7b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+CFLAGS=-W -Wall -Wextra -pedantic -std=c11 -O2
+OBJS=km.o
+
+all: $(OBJS)
+
+%.o: %.c
+ $(CC) -c $(CFLAGS) $<
+
+clean:
+ $(RM) $(OBJS)
diff --git a/km.c b/km.c
new file mode 100644
index 0000000..a6c9301
--- /dev/null
+++ b/km.c
@@ -0,0 +1,404 @@
+#include <stdbool.h> // bool
+#include <stdint.h> // size_t
+#include <string.h> // memcpy()
+#include <stdlib.h> // rand()
+#include <float.h> // FLT_MAX
+#include <math.h> // sqrt()
+#include "km.h"
+
+#define MIN_ROWS (4096 / sizeof(float))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define UNUSED(a) ((void) (a))
+
+// calculate squared euclidean distance between two points
+static float
+distance_squared(
+ const size_t num_floats,
+ const float * const a,
+ const float * const b
+) {
+ float r = 0.0;
+
+ for (size_t i = 0; i <num_floats; i++) {
+ r += (b[i] - a[i]) * (b[i] - a[i]);
+ }
+
+ // return squared distance
+ return r;
+}
+
+// fill buffer with N random floats
+bool
+km_rand_src_fill(
+ km_rand_src_t * const rs,
+ const size_t num_floats,
+ float * const floats
+) {
+ return rs->cbs->fill(rs, num_floats, floats);
+}
+
+// finalize random source
+void
+km_rand_src_fini(
+ km_rand_src_t * const rs
+) {
+ if (rs->cbs->fini) {
+ rs->cbs->fini(rs);
+ }
+}
+
+// fill callback for system random source
+static bool
+rand_src_system_on_fill(
+ km_rand_src_t * const rs,
+ const size_t num_floats,
+ float * const floats
+) {
+ UNUSED(rs);
+
+ // generate random cluster centers
+ for (size_t i = 0; i < num_floats; i++) {
+ floats[i] = 1.0 * rand() / RAND_MAX;
+ }
+
+ // return success
+ return true;
+}
+
+// system random source callbacks
+static const km_rand_src_cbs_t
+RAND_SRC_SYSTEM_CBS = {
+ .fill = rand_src_system_on_fill,
+ .fini = NULL,
+};
+
+// init system random source (uses system rand())
+void
+km_rand_src_system_init(
+ km_rand_src_t * const rs
+) {
+ rs->cbs = &RAND_SRC_SYSTEM_CBS;
+ rs->data = NULL;
+}
+
+// grow data set
+static bool
+km_set_grow(
+ km_set_t * const set,
+ const size_t capacity
+) {
+ // alloc floats
+ float * const floats = malloc(sizeof(float) * set->shape.num_floats * capacity);
+ if (!floats) {
+ return false;
+ }
+
+ // alloc ints
+ int * const ints = malloc(sizeof(int) * set->shape.num_ints * capacity);
+ if (!ints) {
+ return false;
+ }
+
+ set->floats = floats;
+ set->ints = ints;
+ set->capacity = capacity;
+
+ return true;
+}
+
+// init data set with shape and initial size
+bool
+km_set_init(
+ km_set_t * const set,
+ const km_shape_t * const shape,
+ const size_t row_capacity
+) {
+ set->floats = NULL;
+ set->ints = NULL;
+ set->shape = *shape;
+ set->num_rows = 0;
+ set->capacity = 0;
+
+ return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1));
+}
+
+// finalize data set
+void
+km_set_fini(km_set_t * const set) {
+ if (set->floats) {
+ // free floats
+ free(set->floats);
+ set->floats = NULL;
+ }
+
+ if (set->ints) {
+ // free ints
+ free(set->ints);
+ set->ints = NULL;
+ }
+
+ // shrink capacity
+ set->capacity = 0;
+}
+
+/*
+ * // append row to data set
+ * bool
+ * km_set_push_row(
+ * km_set_t * const set,
+ * const float * const floats,
+ * const float * const ints
+ * ) {
+ * if (set->num_rows + 1 == set->capacity) {
+ * // resize buffers
+ * if (!km_set_grow(set, MAX(MIN_ROWS, 2 * set->capacity + 1))) {
+ * return false;
+ * }
+ * }
+ *
+ * // copy floats
+ * const size_t num_floats = set->shape.num_floats;
+ * if (num_floats > 0) {
+ * float * const dst = set->floats + num_floats * set->num_rows;
+ * const size_t stride = sizeof(float) * num_floats;
+ *
+ * memcpy(dst, floats, stride);
+ * }
+ *
+ * // copy ints
+ * const size_t num_ints = set->shape.num_ints;
+ * if (num_ints > 0) {
+ * int * const dst = set->ints + num_ints * set->num_rows;
+ * const size_t stride = sizeof(int) * num_ints;
+ *
+ * memcpy(dst, ints, stride);
+ * }
+ *
+ * // increment row count
+ * set->num_rows++;
+ *
+ * // return success
+ * return true;
+ * }
+ */
+
+// append rows to data set, growing set if necessary
+bool
+km_set_push_rows(
+ km_set_t * const set,
+ const size_t num_rows,
+ const float * const floats,
+ const int * const ints
+) {
+ const size_t capacity_needed = set->num_rows + num_rows;
+ // FIXME: potential overflow here
+ if (capacity_needed >= set->capacity) {
+ // crappy growth algorithm
+ const size_t new_capacity = 2 * capacity_needed + 1;
+
+ // resize storage
+ if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) {
+ return false;
+ }
+ }
+
+ // copy floats
+ const size_t num_floats = set->shape.num_floats;
+ if (num_floats > 0) {
+ float * const dst = set->floats + num_floats * set->num_rows;
+ const size_t stride = sizeof(float) * num_floats;
+
+ // copy floats
+ memcpy(dst, floats, stride * num_rows);
+ }
+
+ // copy ints
+ const size_t num_ints = set->shape.num_ints;
+ if (num_ints > 0) {
+ int * const dst = set->ints + num_ints * set->num_rows;
+ const size_t stride = sizeof(int) * num_ints;
+
+ // copy ints
+ memcpy(dst, ints, stride * num_rows);
+ }
+
+ // increment row count
+ set->num_rows += num_rows;
+
+ // return success
+ return true;
+}
+
+// get row from data set
+float *
+km_set_get_row(
+ const km_set_t * const set,
+ const size_t i
+) {
+ const size_t num_floats = set->shape.num_floats;
+ return set->floats + i * num_floats;
+}
+
+typedef struct {
+ float d2;
+ size_t cluster;
+} row_map_item_t;
+
+// init a cluster set of size N from a data set by picking random
+// cluster centers
+bool
+km_clusters_rand_init(
+ km_set_t * const cs,
+ const size_t num_floats,
+ const size_t num_clusters,
+ km_rand_src_t * const rs
+) {
+ // init cluster shape
+ const km_shape_t shape = {
+ .num_floats = num_floats,
+ .num_ints = 1,
+ };
+
+ // generate random cluster centers
+ float floats[num_floats * num_clusters];
+ if (!km_rand_src_fill(rs, num_floats * num_clusters, floats)) {
+ // return failure
+ return false;
+ }
+
+ // FIXME: should probably be heap-allocated
+ int ints[num_clusters];
+ memset(ints, 0, sizeof(ints));
+
+ // init cluster set
+ if (!km_set_init(cs, &shape, num_clusters)) {
+ // return failure
+ return false;
+ }
+
+ // add data, return result
+ return km_set_push_rows(cs, num_clusters, floats, ints);
+}
+
+// use k-means to iteratively update the cluster centroids until there
+// are no more changes to the centroids
+bool
+km_clusters_solve(
+ km_set_t * const cs,
+ const km_set_t * const set,
+ const km_clusters_solve_cbs_t * const cbs,
+ void * const cb_data
+) {
+ const size_t num_clusters = cs->num_rows,
+ num_floats = set->shape.num_floats;
+
+ // row map: row => distance, cluster ID
+ row_map_item_t *row_map = malloc(sizeof(row_map_item_t) * set->num_rows);
+ if (!row_map) {
+ // return failure
+ return false;
+ }
+
+ // init row map by setting the maximum distance
+ for (size_t i = 0; i < set->num_rows; i++) {
+ row_map[i].d2 = FLT_MAX;
+ }
+
+ // calculate clusters by doing the following:
+ // * walk all clusters and all rows
+ // * if we find a closer cluster, move row to cluster
+ // * if there were changes to any cluster, then calculate a new
+ // centroid for each cluster by averaging the cluster rows
+ // * repeat until there are no more changes
+ bool changed = false;
+ do {
+ // no changes yet
+ changed = false;
+
+ for (size_t i = 0; i < num_clusters; i++) {
+ // get the floats for this cluster
+ const float * const floats = km_set_get_row(cs, i);
+
+ for (size_t j = 0; j < set->num_rows; j++) {
+ const float * const row_floats = km_set_get_row(set, j);
+
+ // calculate the distance squared between these clusters
+ const float d2 = distance_squared(num_floats, floats, row_floats);
+
+ if (d2 < row_map[j].d2) {
+ // row is closer to this cluster, update distance and cluster
+ row_map[j].d2 = d2;
+ row_map[j].cluster = i;
+
+ // flag change
+ changed = true;
+ }
+ }
+ }
+
+ if (changed) {
+ // if there were changes, then we need to calculate the new
+ // cluster centers
+
+ // calculate new center
+ for (size_t i = 0; i < num_clusters; i++) {
+ size_t num_rows = 0;
+ float * const floats = km_set_get_row(cs, i);
+ memset(floats, 0, sizeof(float) * num_floats);
+
+ for (size_t j = 0; j < set->num_rows; j++) {
+ const float * const row_floats = km_set_get_row(set, j);
+
+ if (row_map[j].cluster == i) {
+ // calculate numerator for average
+ for (size_t k = 0; k < num_floats; k++) {
+ floats[k] += row_floats[k];
+ }
+
+ // increment denominator for average
+ num_rows++;
+ }
+ }
+
+ // save number of rows in this cluster
+ cs->ints[i] = num_rows;
+
+ if (num_rows > 0) {
+ for (size_t k = 0; k < num_floats; k++) {
+ // divide by denominator to get average
+ floats[k] /= num_rows;
+ }
+ }
+ }
+ }
+
+ if (cbs && cbs->on_step) {
+ // pass clusters to step callback
+ cbs->on_step(cs, cb_data);
+ }
+ } while (changed);
+
+ if (cbs && cbs->on_step) {
+ float means[num_clusters];
+ memset(means, 0, sizeof(float) * num_clusters);
+
+ // calculate mean distances
+ for (size_t i = 0; i < set->num_rows; i++) {
+ means[row_map[i].cluster] += row_map[i].d2;
+ }
+
+ // calculate mean distances
+ for (size_t i = 0; i < num_clusters; i++) {
+ means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0;
+ }
+
+ // emit means
+ cbs->on_means(cs, means, num_clusters, cb_data);
+ }
+
+ // free row_map
+ free(row_map);
+
+ // return success
+ return true;
+}
diff --git a/km.h b/km.h
new file mode 100644
index 0000000..ef0f0d9
--- /dev/null
+++ b/km.h
@@ -0,0 +1,82 @@
+#ifndef KM_H
+#define KM_H
+
+#include <stddef.h> // size_t
+
+// forward typedef for callbacks
+typedef struct km_rand_src_t_ km_rand_src_t;
+
+// random number source callbacks
+typedef struct {
+ bool (*fill)(km_rand_src_t * const, const size_t, float * const);
+ void (*fini)(km_rand_src_t * const);
+} km_rand_src_cbs_t;
+
+// random number source
+struct km_rand_src_t_ {
+ const km_rand_src_cbs_t *cbs;
+ void *data;
+};
+
+// fill buffer with N random floats
+_Bool km_rand_src_fill(km_rand_src_t * const, const size_t, float * const);
+
+// finalize random source
+void km_rand_src_fini(km_rand_src_t * const);
+
+// init system random source (uses system rand())
+void km_rand_src_system_init(km_rand_src_t *);
+
+// shape of data set
+typedef struct {
+ size_t num_floats,
+ num_ints;
+} km_shape_t;
+
+// data set
+typedef struct {
+ float *floats;
+ int *ints;
+
+ km_shape_t shape;
+
+ size_t num_rows,
+ capacity;
+} km_set_t;
+
+// init data set with shape and initial size
+_Bool km_set_init(km_set_t * const, const km_shape_t *, const size_t);
+// finalize data set
+void km_set_fini(km_set_t * const);
+
+// append rows to data set, growing set if necessary
+_Bool km_set_push_rows(km_set_t *, const size_t, const float *, const int *);
+
+// get row from data set
+float *km_set_get_row(const km_set_t *, const size_t);
+
+// init a cluster set of size N from a data set by picking random
+// cluster centers
+_Bool km_clusters_rand_init(
+ km_set_t *,
+ const size_t,
+ const size_t,
+ km_rand_src_t *
+);
+
+typedef struct {
+ void (*on_step)(const km_set_t *, void *);
+ void (*on_means)(const km_set_t *, const float *, const size_t, void *);
+} km_clusters_solve_cbs_t;
+
+// use k-means to iteratively update the cluster centroids until there
+// are no more changes to the centroids
+_Bool
+km_clusters_solve(
+ km_set_t * const,
+ const km_set_t * const,
+ const km_clusters_solve_cbs_t * const,
+ void * const
+);
+
+#endif /* KM_H */