diff options
author | Paul Duncan <pabs@pablotron.org> | 2019-02-03 04:54:10 -0500 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2019-02-03 04:54:10 -0500 |
commit | 1e3dc2401263a436a32377e207352ffbe6984e8f (patch) | |
tree | fc34de6b8d7a456d861d7534360073ad10d8c620 /km.c | |
parent | a3792d8769d2dc8ee0abae758c6fae3a35b5dfbc (diff) | |
download | kmeans-1e3dc2401263a436a32377e207352ffbe6984e8f.tar.bz2 kmeans-1e3dc2401263a436a32377e207352ffbe6984e8f.zip |
add outstanding changes
Diffstat (limited to 'km.c')
-rw-r--r-- | km.c | 387 |
1 files changed, 89 insertions, 298 deletions
@@ -2,85 +2,10 @@ #include <stdint.h> // size_t #include <string.h> // memcpy() #include <stdlib.h> // rand() -#include <float.h> // FLT_MAX -#include <math.h> // sqrt() +#include "util.h" #include "km.h" #define MIN_ROWS (4096 / sizeof(float)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define UNUSED(a) ((void) (a)) -#define MIN_CLUSTER_DISTANCE 0.0001 - -// calculate squared euclidean distance between two points -static float -distance_squared( - const size_t num_floats, - const float * const a, - const float * const b -) { - float r = 0.0; - - for (size_t i = 0; i < num_floats; i++) { - r += (b[i] - a[i]) * (b[i] - a[i]); - } - - // return squared distance - return r; -} - -// fill buffer with N random floats -bool -km_rand_src_fill( - km_rand_src_t * const rs, - const size_t num_floats, - float * const floats -) { - return rs->cbs->fill(rs, num_floats, floats); -} - -// finalize random source -void -km_rand_src_fini( - km_rand_src_t * const rs -) { - if (rs->cbs->fini) { - rs->cbs->fini(rs); - } -} - -// fill callback for system random source -static bool -rand_src_system_on_fill( - km_rand_src_t * const rs, - const size_t num_floats, - float * const floats -) { - UNUSED(rs); - - // generate random cluster centers - for (size_t i = 0; i < num_floats; i++) { - floats[i] = 1.0 * rand() / RAND_MAX; - } - - // return success - return true; -} - -// system random source callbacks -static const km_rand_src_cbs_t -RAND_SRC_SYSTEM_CBS = { - .fill = rand_src_system_on_fill, - .fini = NULL, -}; - -// init system random source (uses system rand()) -void -km_rand_src_system_init( - km_rand_src_t * const rs -) { - rs->cbs = &RAND_SRC_SYSTEM_CBS; - rs->data = NULL; -} // grow data set static bool @@ -104,6 +29,7 @@ km_set_grow( return false; } + // update set set->floats = floats; set->ints = ints; set->capacity = capacity; @@ -119,11 +45,20 @@ km_set_init( const km_shape_t * const shape, const size_t row_capacity ) { + // alloc bounds + float * const bounds = malloc(2 * sizeof(float) * shape->num_floats); + if (!bounds) { + // return error + return false; + } + + set->state = KM_SET_STATE_INIT; set->floats = NULL; set->ints = NULL; set->shape = *shape; set->num_rows = 0; set->capacity = 0; + set->bounds = bounds; return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1)); } @@ -131,6 +66,16 @@ km_set_init( // finalize data set void km_set_fini(km_set_t * const set) { + if (set->state == KM_SET_STATE_FINI) { + return; + } + + if (set->bounds) { + // free bounds + free(set->bounds); + set->bounds = NULL; + } + if (set->floats) { // free floats free(set->floats); @@ -145,16 +90,25 @@ km_set_fini(km_set_t * const set) { // shrink capacity set->capacity = 0; + + // set state + set->state = KM_SET_STATE_FINI; } // append rows to data set, growing set if necessary bool -km_set_push_rows( +km_set_push( km_set_t * const set, const size_t num_rows, const float * const floats, const int * const ints ) { + // check state + if (set->state != KM_SET_STATE_INIT) { + // return failure + return false; + } + const size_t capacity_needed = set->num_rows + num_rows; // FIXME: potential overflow here if (capacity_needed >= set->capacity) { @@ -175,6 +129,28 @@ km_set_push_rows( // copy floats memcpy(dst, floats, stride * num_rows); + + if (!set->num_rows) { + // there were no rows, so populate bounds with first row + memcpy(set->bounds, floats, stride); + memcpy(set->bounds + num_floats, floats, stride); + } + + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < num_floats; j++) { + const float val = floats[i * num_floats + j]; + + if (val < set->bounds[j]) { + // update min bound + set->bounds[j] = val; + } + + if (val > set->bounds[num_floats + j]) { + // update max bound + set->bounds[num_floats + j] = val; + } + } + } } // copy ints @@ -194,6 +170,38 @@ km_set_push_rows( return true; } +bool +km_set_normalize( + km_set_t * const set +) { + const size_t num_floats = set->shape.num_floats; + + // check set state + if (set->state != KM_SET_STATE_INIT) { + // return failure + return false; + } + + // normalize values + for (size_t i = 0; i < set->num_rows; i++) { + for (size_t j = 0; j < num_floats; j++) { + const size_t ofs = i * num_floats + j; + const float val = set->floats[ofs], + min = set->bounds[j], + max = set->bounds[num_floats + j]; + + // normalize and write value + set->floats[ofs] = (val - min) / (max - min); + } + } + + // set state + set->state = KM_SET_STATE_NORMALIZED; + + // return success + return true; +} + // get row from data set float * km_set_get_row( @@ -204,15 +212,10 @@ km_set_get_row( return set->floats + i * num_floats; } -typedef struct { - float d2; - size_t cluster; -} row_map_item_t; - -// init a cluster set of size N from a data set by picking random -// cluster centers +// init a set with num_clusters clusters of shape num_floats by picking +// random cluster centers bool -km_clusters_rand_init( +km_set_init_rand_clusters( km_set_t * const cs, const size_t num_floats, const size_t num_clusters, @@ -242,217 +245,5 @@ km_clusters_rand_init( } // add data, return result - return km_set_push_rows(cs, num_clusters, floats, ints); -} - -// use k-means to iteratively update the cluster centroids until there -// are no more changes to the centroids -bool -km_clusters_solve( - km_set_t * const cs, - const km_set_t * const set, - const km_clusters_solve_cbs_t * const cbs, - void * const cb_data -) { - const size_t num_clusters = cs->num_rows, - num_floats = set->shape.num_floats; - - // row map: row => distance, cluster ID - row_map_item_t *row_map = malloc(sizeof(row_map_item_t) * set->num_rows); - if (!row_map) { - // return failure - return false; - } - - // init row map by setting the maximum distance - for (size_t i = 0; i < set->num_rows; i++) { - row_map[i].d2 = FLT_MAX; - } - - // calculate clusters by doing the following: - // * walk all clusters and all rows - // * if we find a closer cluster, move row to cluster - // * if there were changes to any cluster, then calculate a new - // centroid for each cluster by averaging the cluster rows - // * repeat until there are no more changes - bool changed = false; - do { - // no changes yet - changed = false; - - for (size_t i = 0; i < num_clusters; i++) { - // get the floats for this cluster - const float * const floats = km_set_get_row(cs, i); - - for (size_t j = 0; j < set->num_rows; j++) { - const float * const row_floats = km_set_get_row(set, j); - - // calculate the distance squared between these clusters - const float d2 = distance_squared(num_floats, floats, row_floats); - - if (d2 < row_map[j].d2) { - // row is closer to this cluster, update distance and cluster - row_map[j].d2 = d2; - row_map[j].cluster = i; - - // flag change - changed = true; - } - } - } - - if (changed) { - // if there were changes, then we need to calculate the new - // cluster centers - - // calculate new center - for (size_t i = 0; i < num_clusters; i++) { - size_t num_rows = 0; - float * const floats = km_set_get_row(cs, i); - memset(floats, 0, sizeof(float) * num_floats); - - for (size_t j = 0; j < set->num_rows; j++) { - const float * const row_floats = km_set_get_row(set, j); - - if (row_map[j].cluster == i) { - // calculate numerator for average - for (size_t k = 0; k < num_floats; k++) { - floats[k] += row_floats[k]; - } - - // increment denominator for average - num_rows++; - } - } - - // save number of rows in this cluster - cs->ints[i] = num_rows; - - if (num_rows > 0) { - for (size_t k = 0; k < num_floats; k++) { - // divide by denominator to get average - floats[k] /= num_rows; - } - } - } - } - - if (cbs && cbs->on_step) { - // pass clusters to step callback - cbs->on_step(cs, cb_data); - } - } while (changed); - - if (cbs && cbs->on_step) { - float means[num_clusters]; - memset(means, 0, sizeof(float) * num_clusters); - - // calculate mean distances - for (size_t i = 0; i < set->num_rows; i++) { - means[row_map[i].cluster] += row_map[i].d2; - } - - // calculate mean distances - for (size_t i = 0; i < num_clusters; i++) { - means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0; - } - - // emit means - cbs->on_means(cs, means, num_clusters, cb_data); - } - - // free row_map - free(row_map); - - // return success - return true; -} - -typedef struct { - float sum; - size_t num_empty_clusters; -} search_test_data_t; - -static void -search_test_on_means( - const km_set_t * const set, - const float * const means, - const size_t num_clusters, - void * const cb_data -) { - search_test_data_t *test_data = cb_data; - UNUSED(set); - - // calculate numerator for the average distance across all clusters in - // this test - for (size_t i = 0; i < num_clusters; i++) { - if (fabsf(means[i]) > MIN_CLUSTER_DISTANCE) { - test_data->sum += means[i]; - } else { - test_data->num_empty_clusters++; - } - } -} - -static const km_clusters_solve_cbs_t -SEARCH_TEST_CBS = { - .on_means = search_test_on_means, -}; - -bool -km_search( - const km_set_t * const set, - const size_t max_clusters, - const size_t num_tests, - const km_search_row_cb_t on_row, - void *cb_data -) { - // init random source - km_rand_src_t rs; - km_rand_src_system_init(&rs); - - for (size_t i = 2; i < max_clusters; i++) { - for (size_t j = 0; j < num_tests; j++) { - // init cluster set - km_set_t cs; - if (!km_clusters_rand_init(&cs, set->shape.num_floats, i, &rs)) { - // return failure - return false; - } - - // init test data - search_test_data_t data = { - .sum = 0, - .num_empty_clusters = 0, - }; - - // solve test - if (!km_clusters_solve(&cs, set, &SEARCH_TEST_CBS, &data)) { - // return failure - return false; - } - - if (on_row) { - // calculate mean - const float mean = (data.num_empty_clusters < i) ? (data.sum / (i - data.num_empty_clusters)) : 0; - - // init search result row - km_search_row_t row = { - .cluster_set = &cs, - .num_clusters = i, - .mean_distance = mean, - .num_empty_clusters = data.num_empty_clusters, - }; - - // emit row - on_row(&row, cb_data); - } - - // free cluster set - km_set_fini(&cs); - } - } - - // return success - return true; + return km_set_push(cs, num_clusters, floats, ints); } |