diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 10 | ||||
-rw-r--r-- | km.c | 404 | ||||
-rw-r--r-- | km.h | 82 |
4 files changed, 497 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5761abc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.o diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3873d7b --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +CFLAGS=-W -Wall -Wextra -pedantic -std=c11 -O2 +OBJS=km.o + +all: $(OBJS) + +%.o: %.c + $(CC) -c $(CFLAGS) $< + +clean: + $(RM) $(OBJS) @@ -0,0 +1,404 @@ +#include <stdbool.h> // bool +#include <stdint.h> // size_t +#include <string.h> // memcpy() +#include <stdlib.h> // rand() +#include <float.h> // FLT_MAX +#include <math.h> // sqrt() +#include "km.h" + +#define MIN_ROWS (4096 / sizeof(float)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define UNUSED(a) ((void) (a)) + +// calculate squared euclidean distance between two points +static float +distance_squared( + const size_t num_floats, + const float * const a, + const float * const b +) { + float r = 0.0; + + for (size_t i = 0; i <num_floats; i++) { + r += (b[i] - a[i]) * (b[i] - a[i]); + } + + // return squared distance + return r; +} + +// fill buffer with N random floats +bool +km_rand_src_fill( + km_rand_src_t * const rs, + const size_t num_floats, + float * const floats +) { + return rs->cbs->fill(rs, num_floats, floats); +} + +// finalize random source +void +km_rand_src_fini( + km_rand_src_t * const rs +) { + if (rs->cbs->fini) { + rs->cbs->fini(rs); + } +} + +// fill callback for system random source +static bool +rand_src_system_on_fill( + km_rand_src_t * const rs, + const size_t num_floats, + float * const floats +) { + UNUSED(rs); + + // generate random cluster centers + for (size_t i = 0; i < num_floats; i++) { + floats[i] = 1.0 * rand() / RAND_MAX; + } + + // return success + return true; +} + +// system random source callbacks +static const km_rand_src_cbs_t +RAND_SRC_SYSTEM_CBS = { + .fill = rand_src_system_on_fill, + .fini = NULL, +}; + +// init system random source (uses system rand()) +void +km_rand_src_system_init( + km_rand_src_t * const rs +) { + rs->cbs = &RAND_SRC_SYSTEM_CBS; + rs->data = NULL; +} + +// grow data set +static bool +km_set_grow( + km_set_t * const set, + const size_t capacity +) { + // alloc floats + float * const floats = malloc(sizeof(float) * set->shape.num_floats * capacity); + if (!floats) { + return false; + } + + // alloc ints + int * const ints = malloc(sizeof(int) * set->shape.num_ints * capacity); + if (!ints) { + return false; + } + + set->floats = floats; + set->ints = ints; + set->capacity = capacity; + + return true; +} + +// init data set with shape and initial size +bool +km_set_init( + km_set_t * const set, + const km_shape_t * const shape, + const size_t row_capacity +) { + set->floats = NULL; + set->ints = NULL; + set->shape = *shape; + set->num_rows = 0; + set->capacity = 0; + + return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1)); +} + +// finalize data set +void +km_set_fini(km_set_t * const set) { + if (set->floats) { + // free floats + free(set->floats); + set->floats = NULL; + } + + if (set->ints) { + // free ints + free(set->ints); + set->ints = NULL; + } + + // shrink capacity + set->capacity = 0; +} + +/* + * // append row to data set + * bool + * km_set_push_row( + * km_set_t * const set, + * const float * const floats, + * const float * const ints + * ) { + * if (set->num_rows + 1 == set->capacity) { + * // resize buffers + * if (!km_set_grow(set, MAX(MIN_ROWS, 2 * set->capacity + 1))) { + * return false; + * } + * } + * + * // copy floats + * const size_t num_floats = set->shape.num_floats; + * if (num_floats > 0) { + * float * const dst = set->floats + num_floats * set->num_rows; + * const size_t stride = sizeof(float) * num_floats; + * + * memcpy(dst, floats, stride); + * } + * + * // copy ints + * const size_t num_ints = set->shape.num_ints; + * if (num_ints > 0) { + * int * const dst = set->ints + num_ints * set->num_rows; + * const size_t stride = sizeof(int) * num_ints; + * + * memcpy(dst, ints, stride); + * } + * + * // increment row count + * set->num_rows++; + * + * // return success + * return true; + * } + */ + +// append rows to data set, growing set if necessary +bool +km_set_push_rows( + km_set_t * const set, + const size_t num_rows, + const float * const floats, + const int * const ints +) { + const size_t capacity_needed = set->num_rows + num_rows; + // FIXME: potential overflow here + if (capacity_needed >= set->capacity) { + // crappy growth algorithm + const size_t new_capacity = 2 * capacity_needed + 1; + + // resize storage + if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) { + return false; + } + } + + // copy floats + const size_t num_floats = set->shape.num_floats; + if (num_floats > 0) { + float * const dst = set->floats + num_floats * set->num_rows; + const size_t stride = sizeof(float) * num_floats; + + // copy floats + memcpy(dst, floats, stride * num_rows); + } + + // copy ints + const size_t num_ints = set->shape.num_ints; + if (num_ints > 0) { + int * const dst = set->ints + num_ints * set->num_rows; + const size_t stride = sizeof(int) * num_ints; + + // copy ints + memcpy(dst, ints, stride * num_rows); + } + + // increment row count + set->num_rows += num_rows; + + // return success + return true; +} + +// get row from data set +float * +km_set_get_row( + const km_set_t * const set, + const size_t i +) { + const size_t num_floats = set->shape.num_floats; + return set->floats + i * num_floats; +} + +typedef struct { + float d2; + size_t cluster; +} row_map_item_t; + +// init a cluster set of size N from a data set by picking random +// cluster centers +bool +km_clusters_rand_init( + km_set_t * const cs, + const size_t num_floats, + const size_t num_clusters, + km_rand_src_t * const rs +) { + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // generate random cluster centers + float floats[num_floats * num_clusters]; + if (!km_rand_src_fill(rs, num_floats * num_clusters, floats)) { + // return failure + return false; + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push_rows(cs, num_clusters, floats, ints); +} + +// use k-means to iteratively update the cluster centroids until there +// are no more changes to the centroids +bool +km_clusters_solve( + km_set_t * const cs, + const km_set_t * const set, + const km_clusters_solve_cbs_t * const cbs, + void * const cb_data +) { + const size_t num_clusters = cs->num_rows, + num_floats = set->shape.num_floats; + + // row map: row => distance, cluster ID + row_map_item_t *row_map = malloc(sizeof(row_map_item_t) * set->num_rows); + if (!row_map) { + // return failure + return false; + } + + // init row map by setting the maximum distance + for (size_t i = 0; i < set->num_rows; i++) { + row_map[i].d2 = FLT_MAX; + } + + // calculate clusters by doing the following: + // * walk all clusters and all rows + // * if we find a closer cluster, move row to cluster + // * if there were changes to any cluster, then calculate a new + // centroid for each cluster by averaging the cluster rows + // * repeat until there are no more changes + bool changed = false; + do { + // no changes yet + changed = false; + + for (size_t i = 0; i < num_clusters; i++) { + // get the floats for this cluster + const float * const floats = km_set_get_row(cs, i); + + for (size_t j = 0; j < set->num_rows; j++) { + const float * const row_floats = km_set_get_row(set, j); + + // calculate the distance squared between these clusters + const float d2 = distance_squared(num_floats, floats, row_floats); + + if (d2 < row_map[j].d2) { + // row is closer to this cluster, update distance and cluster + row_map[j].d2 = d2; + row_map[j].cluster = i; + + // flag change + changed = true; + } + } + } + + if (changed) { + // if there were changes, then we need to calculate the new + // cluster centers + + // calculate new center + for (size_t i = 0; i < num_clusters; i++) { + size_t num_rows = 0; + float * const floats = km_set_get_row(cs, i); + memset(floats, 0, sizeof(float) * num_floats); + + for (size_t j = 0; j < set->num_rows; j++) { + const float * const row_floats = km_set_get_row(set, j); + + if (row_map[j].cluster == i) { + // calculate numerator for average + for (size_t k = 0; k < num_floats; k++) { + floats[k] += row_floats[k]; + } + + // increment denominator for average + num_rows++; + } + } + + // save number of rows in this cluster + cs->ints[i] = num_rows; + + if (num_rows > 0) { + for (size_t k = 0; k < num_floats; k++) { + // divide by denominator to get average + floats[k] /= num_rows; + } + } + } + } + + if (cbs && cbs->on_step) { + // pass clusters to step callback + cbs->on_step(cs, cb_data); + } + } while (changed); + + if (cbs && cbs->on_step) { + float means[num_clusters]; + memset(means, 0, sizeof(float) * num_clusters); + + // calculate mean distances + for (size_t i = 0; i < set->num_rows; i++) { + means[row_map[i].cluster] += row_map[i].d2; + } + + // calculate mean distances + for (size_t i = 0; i < num_clusters; i++) { + means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0; + } + + // emit means + cbs->on_means(cs, means, num_clusters, cb_data); + } + + // free row_map + free(row_map); + + // return success + return true; +} @@ -0,0 +1,82 @@ +#ifndef KM_H +#define KM_H + +#include <stddef.h> // size_t + +// forward typedef for callbacks +typedef struct km_rand_src_t_ km_rand_src_t; + +// random number source callbacks +typedef struct { + bool (*fill)(km_rand_src_t * const, const size_t, float * const); + void (*fini)(km_rand_src_t * const); +} km_rand_src_cbs_t; + +// random number source +struct km_rand_src_t_ { + const km_rand_src_cbs_t *cbs; + void *data; +}; + +// fill buffer with N random floats +_Bool km_rand_src_fill(km_rand_src_t * const, const size_t, float * const); + +// finalize random source +void km_rand_src_fini(km_rand_src_t * const); + +// init system random source (uses system rand()) +void km_rand_src_system_init(km_rand_src_t *); + +// shape of data set +typedef struct { + size_t num_floats, + num_ints; +} km_shape_t; + +// data set +typedef struct { + float *floats; + int *ints; + + km_shape_t shape; + + size_t num_rows, + capacity; +} km_set_t; + +// init data set with shape and initial size +_Bool km_set_init(km_set_t * const, const km_shape_t *, const size_t); +// finalize data set +void km_set_fini(km_set_t * const); + +// append rows to data set, growing set if necessary +_Bool km_set_push_rows(km_set_t *, const size_t, const float *, const int *); + +// get row from data set +float *km_set_get_row(const km_set_t *, const size_t); + +// init a cluster set of size N from a data set by picking random +// cluster centers +_Bool km_clusters_rand_init( + km_set_t *, + const size_t, + const size_t, + km_rand_src_t * +); + +typedef struct { + void (*on_step)(const km_set_t *, void *); + void (*on_means)(const km_set_t *, const float *, const size_t, void *); +} km_clusters_solve_cbs_t; + +// use k-means to iteratively update the cluster centroids until there +// are no more changes to the centroids +_Bool +km_clusters_solve( + km_set_t * const, + const km_set_t * const, + const km_clusters_solve_cbs_t * const, + void * const +); + +#endif /* KM_H */ |