#include // bool #include // size_t #include // memcpy() #include // rand() #include // FLT_MAX #include // sqrt() #include "km.h" #define MIN_ROWS (4096 / sizeof(float)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define UNUSED(a) ((void) (a)) // calculate squared euclidean distance between two points static float distance_squared( const size_t num_floats, const float * const a, const float * const b ) { float r = 0.0; for (size_t i = 0; i cbs->fill(rs, num_floats, floats); } // finalize random source void km_rand_src_fini( km_rand_src_t * const rs ) { if (rs->cbs->fini) { rs->cbs->fini(rs); } } // fill callback for system random source static bool rand_src_system_on_fill( km_rand_src_t * const rs, const size_t num_floats, float * const floats ) { UNUSED(rs); // generate random cluster centers for (size_t i = 0; i < num_floats; i++) { floats[i] = 1.0 * rand() / RAND_MAX; } // return success return true; } // system random source callbacks static const km_rand_src_cbs_t RAND_SRC_SYSTEM_CBS = { .fill = rand_src_system_on_fill, .fini = NULL, }; // init system random source (uses system rand()) void km_rand_src_system_init( km_rand_src_t * const rs ) { rs->cbs = &RAND_SRC_SYSTEM_CBS; rs->data = NULL; } // grow data set static bool km_set_grow( km_set_t * const set, const size_t capacity ) { // alloc floats float * const floats = malloc(sizeof(float) * set->shape.num_floats * capacity); if (!floats) { return false; } // alloc ints int * const ints = malloc(sizeof(int) * set->shape.num_ints * capacity); if (!ints) { return false; } set->floats = floats; set->ints = ints; set->capacity = capacity; return true; } // init data set with shape and initial size bool km_set_init( km_set_t * const set, const km_shape_t * const shape, const size_t row_capacity ) { set->floats = NULL; set->ints = NULL; set->shape = *shape; set->num_rows = 0; set->capacity = 0; return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1)); } // finalize data set void km_set_fini(km_set_t * const set) { if (set->floats) { // free floats free(set->floats); set->floats = NULL; } if (set->ints) { // free ints free(set->ints); set->ints = NULL; } // shrink capacity set->capacity = 0; } /* * // append row to data set * bool * km_set_push_row( * km_set_t * const set, * const float * const floats, * const float * const ints * ) { * if (set->num_rows + 1 == set->capacity) { * // resize buffers * if (!km_set_grow(set, MAX(MIN_ROWS, 2 * set->capacity + 1))) { * return false; * } * } * * // copy floats * const size_t num_floats = set->shape.num_floats; * if (num_floats > 0) { * float * const dst = set->floats + num_floats * set->num_rows; * const size_t stride = sizeof(float) * num_floats; * * memcpy(dst, floats, stride); * } * * // copy ints * const size_t num_ints = set->shape.num_ints; * if (num_ints > 0) { * int * const dst = set->ints + num_ints * set->num_rows; * const size_t stride = sizeof(int) * num_ints; * * memcpy(dst, ints, stride); * } * * // increment row count * set->num_rows++; * * // return success * return true; * } */ // append rows to data set, growing set if necessary bool km_set_push_rows( km_set_t * const set, const size_t num_rows, const float * const floats, const int * const ints ) { const size_t capacity_needed = set->num_rows + num_rows; // FIXME: potential overflow here if (capacity_needed >= set->capacity) { // crappy growth algorithm const size_t new_capacity = 2 * capacity_needed + 1; // resize storage if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) { return false; } } // copy floats const size_t num_floats = set->shape.num_floats; if (num_floats > 0) { float * const dst = set->floats + num_floats * set->num_rows; const size_t stride = sizeof(float) * num_floats; // copy floats memcpy(dst, floats, stride * num_rows); } // copy ints const size_t num_ints = set->shape.num_ints; if (num_ints > 0) { int * const dst = set->ints + num_ints * set->num_rows; const size_t stride = sizeof(int) * num_ints; // copy ints memcpy(dst, ints, stride * num_rows); } // increment row count set->num_rows += num_rows; // return success return true; } // get row from data set float * km_set_get_row( const km_set_t * const set, const size_t i ) { const size_t num_floats = set->shape.num_floats; return set->floats + i * num_floats; } typedef struct { float d2; size_t cluster; } row_map_item_t; // init a cluster set of size N from a data set by picking random // cluster centers bool km_clusters_rand_init( km_set_t * const cs, const size_t num_floats, const size_t num_clusters, km_rand_src_t * const rs ) { // init cluster shape const km_shape_t shape = { .num_floats = num_floats, .num_ints = 1, }; // generate random cluster centers float floats[num_floats * num_clusters]; if (!km_rand_src_fill(rs, num_floats * num_clusters, floats)) { // return failure return false; } // FIXME: should probably be heap-allocated int ints[num_clusters]; memset(ints, 0, sizeof(ints)); // init cluster set if (!km_set_init(cs, &shape, num_clusters)) { // return failure return false; } // add data, return result return km_set_push_rows(cs, num_clusters, floats, ints); } // use k-means to iteratively update the cluster centroids until there // are no more changes to the centroids bool km_clusters_solve( km_set_t * const cs, const km_set_t * const set, const km_clusters_solve_cbs_t * const cbs, void * const cb_data ) { const size_t num_clusters = cs->num_rows, num_floats = set->shape.num_floats; // row map: row => distance, cluster ID row_map_item_t *row_map = malloc(sizeof(row_map_item_t) * set->num_rows); if (!row_map) { // return failure return false; } // init row map by setting the maximum distance for (size_t i = 0; i < set->num_rows; i++) { row_map[i].d2 = FLT_MAX; } // calculate clusters by doing the following: // * walk all clusters and all rows // * if we find a closer cluster, move row to cluster // * if there were changes to any cluster, then calculate a new // centroid for each cluster by averaging the cluster rows // * repeat until there are no more changes bool changed = false; do { // no changes yet changed = false; for (size_t i = 0; i < num_clusters; i++) { // get the floats for this cluster const float * const floats = km_set_get_row(cs, i); for (size_t j = 0; j < set->num_rows; j++) { const float * const row_floats = km_set_get_row(set, j); // calculate the distance squared between these clusters const float d2 = distance_squared(num_floats, floats, row_floats); if (d2 < row_map[j].d2) { // row is closer to this cluster, update distance and cluster row_map[j].d2 = d2; row_map[j].cluster = i; // flag change changed = true; } } } if (changed) { // if there were changes, then we need to calculate the new // cluster centers // calculate new center for (size_t i = 0; i < num_clusters; i++) { size_t num_rows = 0; float * const floats = km_set_get_row(cs, i); memset(floats, 0, sizeof(float) * num_floats); for (size_t j = 0; j < set->num_rows; j++) { const float * const row_floats = km_set_get_row(set, j); if (row_map[j].cluster == i) { // calculate numerator for average for (size_t k = 0; k < num_floats; k++) { floats[k] += row_floats[k]; } // increment denominator for average num_rows++; } } // save number of rows in this cluster cs->ints[i] = num_rows; if (num_rows > 0) { for (size_t k = 0; k < num_floats; k++) { // divide by denominator to get average floats[k] /= num_rows; } } } } if (cbs && cbs->on_step) { // pass clusters to step callback cbs->on_step(cs, cb_data); } } while (changed); if (cbs && cbs->on_step) { float means[num_clusters]; memset(means, 0, sizeof(float) * num_clusters); // calculate mean distances for (size_t i = 0; i < set->num_rows; i++) { means[row_map[i].cluster] += row_map[i].d2; } // calculate mean distances for (size_t i = 0; i < num_clusters; i++) { means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0; } // emit means cbs->on_means(cs, means, num_clusters, cb_data); } // free row_map free(row_map); // return success return true; }