aboutsummaryrefslogtreecommitdiff
path: root/km.c
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2019-02-03 04:54:10 -0500
committerPaul Duncan <pabs@pablotron.org>2019-02-03 04:54:10 -0500
commit1e3dc2401263a436a32377e207352ffbe6984e8f (patch)
treefc34de6b8d7a456d861d7534360073ad10d8c620 /km.c
parenta3792d8769d2dc8ee0abae758c6fae3a35b5dfbc (diff)
downloadkmeans-1e3dc2401263a436a32377e207352ffbe6984e8f.tar.bz2
kmeans-1e3dc2401263a436a32377e207352ffbe6984e8f.zip
add outstanding changes
Diffstat (limited to 'km.c')
-rw-r--r--km.c387
1 files changed, 89 insertions, 298 deletions
diff --git a/km.c b/km.c
index 135132b..5b9ae4c 100644
--- a/km.c
+++ b/km.c
@@ -2,85 +2,10 @@
#include <stdint.h> // size_t
#include <string.h> // memcpy()
#include <stdlib.h> // rand()
-#include <float.h> // FLT_MAX
-#include <math.h> // sqrt()
+#include "util.h"
#include "km.h"
#define MIN_ROWS (4096 / sizeof(float))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define UNUSED(a) ((void) (a))
-#define MIN_CLUSTER_DISTANCE 0.0001
-
-// calculate squared euclidean distance between two points
-static float
-distance_squared(
- const size_t num_floats,
- const float * const a,
- const float * const b
-) {
- float r = 0.0;
-
- for (size_t i = 0; i < num_floats; i++) {
- r += (b[i] - a[i]) * (b[i] - a[i]);
- }
-
- // return squared distance
- return r;
-}
-
-// fill buffer with N random floats
-bool
-km_rand_src_fill(
- km_rand_src_t * const rs,
- const size_t num_floats,
- float * const floats
-) {
- return rs->cbs->fill(rs, num_floats, floats);
-}
-
-// finalize random source
-void
-km_rand_src_fini(
- km_rand_src_t * const rs
-) {
- if (rs->cbs->fini) {
- rs->cbs->fini(rs);
- }
-}
-
-// fill callback for system random source
-static bool
-rand_src_system_on_fill(
- km_rand_src_t * const rs,
- const size_t num_floats,
- float * const floats
-) {
- UNUSED(rs);
-
- // generate random cluster centers
- for (size_t i = 0; i < num_floats; i++) {
- floats[i] = 1.0 * rand() / RAND_MAX;
- }
-
- // return success
- return true;
-}
-
-// system random source callbacks
-static const km_rand_src_cbs_t
-RAND_SRC_SYSTEM_CBS = {
- .fill = rand_src_system_on_fill,
- .fini = NULL,
-};
-
-// init system random source (uses system rand())
-void
-km_rand_src_system_init(
- km_rand_src_t * const rs
-) {
- rs->cbs = &RAND_SRC_SYSTEM_CBS;
- rs->data = NULL;
-}
// grow data set
static bool
@@ -104,6 +29,7 @@ km_set_grow(
return false;
}
+ // update set
set->floats = floats;
set->ints = ints;
set->capacity = capacity;
@@ -119,11 +45,20 @@ km_set_init(
const km_shape_t * const shape,
const size_t row_capacity
) {
+ // alloc bounds
+ float * const bounds = malloc(2 * sizeof(float) * shape->num_floats);
+ if (!bounds) {
+ // return error
+ return false;
+ }
+
+ set->state = KM_SET_STATE_INIT;
set->floats = NULL;
set->ints = NULL;
set->shape = *shape;
set->num_rows = 0;
set->capacity = 0;
+ set->bounds = bounds;
return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1));
}
@@ -131,6 +66,16 @@ km_set_init(
// finalize data set
void
km_set_fini(km_set_t * const set) {
+ if (set->state == KM_SET_STATE_FINI) {
+ return;
+ }
+
+ if (set->bounds) {
+ // free bounds
+ free(set->bounds);
+ set->bounds = NULL;
+ }
+
if (set->floats) {
// free floats
free(set->floats);
@@ -145,16 +90,25 @@ km_set_fini(km_set_t * const set) {
// shrink capacity
set->capacity = 0;
+
+ // set state
+ set->state = KM_SET_STATE_FINI;
}
// append rows to data set, growing set if necessary
bool
-km_set_push_rows(
+km_set_push(
km_set_t * const set,
const size_t num_rows,
const float * const floats,
const int * const ints
) {
+ // check state
+ if (set->state != KM_SET_STATE_INIT) {
+ // return failure
+ return false;
+ }
+
const size_t capacity_needed = set->num_rows + num_rows;
// FIXME: potential overflow here
if (capacity_needed >= set->capacity) {
@@ -175,6 +129,28 @@ km_set_push_rows(
// copy floats
memcpy(dst, floats, stride * num_rows);
+
+ if (!set->num_rows) {
+ // there were no rows, so populate bounds with first row
+ memcpy(set->bounds, floats, stride);
+ memcpy(set->bounds + num_floats, floats, stride);
+ }
+
+ for (size_t i = 0; i < num_rows; i++) {
+ for (size_t j = 0; j < num_floats; j++) {
+ const float val = floats[i * num_floats + j];
+
+ if (val < set->bounds[j]) {
+ // update min bound
+ set->bounds[j] = val;
+ }
+
+ if (val > set->bounds[num_floats + j]) {
+ // update max bound
+ set->bounds[num_floats + j] = val;
+ }
+ }
+ }
}
// copy ints
@@ -194,6 +170,38 @@ km_set_push_rows(
return true;
}
+bool
+km_set_normalize(
+ km_set_t * const set
+) {
+ const size_t num_floats = set->shape.num_floats;
+
+ // check set state
+ if (set->state != KM_SET_STATE_INIT) {
+ // return failure
+ return false;
+ }
+
+ // normalize values
+ for (size_t i = 0; i < set->num_rows; i++) {
+ for (size_t j = 0; j < num_floats; j++) {
+ const size_t ofs = i * num_floats + j;
+ const float val = set->floats[ofs],
+ min = set->bounds[j],
+ max = set->bounds[num_floats + j];
+
+ // normalize and write value
+ set->floats[ofs] = (val - min) / (max - min);
+ }
+ }
+
+ // set state
+ set->state = KM_SET_STATE_NORMALIZED;
+
+ // return success
+ return true;
+}
+
// get row from data set
float *
km_set_get_row(
@@ -204,15 +212,10 @@ km_set_get_row(
return set->floats + i * num_floats;
}
-typedef struct {
- float d2;
- size_t cluster;
-} row_map_item_t;
-
-// init a cluster set of size N from a data set by picking random
-// cluster centers
+// init a set with num_clusters clusters of shape num_floats by picking
+// random cluster centers
bool
-km_clusters_rand_init(
+km_set_init_rand_clusters(
km_set_t * const cs,
const size_t num_floats,
const size_t num_clusters,
@@ -242,217 +245,5 @@ km_clusters_rand_init(
}
// add data, return result
- return km_set_push_rows(cs, num_clusters, floats, ints);
-}
-
-// use k-means to iteratively update the cluster centroids until there
-// are no more changes to the centroids
-bool
-km_clusters_solve(
- km_set_t * const cs,
- const km_set_t * const set,
- const km_clusters_solve_cbs_t * const cbs,
- void * const cb_data
-) {
- const size_t num_clusters = cs->num_rows,
- num_floats = set->shape.num_floats;
-
- // row map: row => distance, cluster ID
- row_map_item_t *row_map = malloc(sizeof(row_map_item_t) * set->num_rows);
- if (!row_map) {
- // return failure
- return false;
- }
-
- // init row map by setting the maximum distance
- for (size_t i = 0; i < set->num_rows; i++) {
- row_map[i].d2 = FLT_MAX;
- }
-
- // calculate clusters by doing the following:
- // * walk all clusters and all rows
- // * if we find a closer cluster, move row to cluster
- // * if there were changes to any cluster, then calculate a new
- // centroid for each cluster by averaging the cluster rows
- // * repeat until there are no more changes
- bool changed = false;
- do {
- // no changes yet
- changed = false;
-
- for (size_t i = 0; i < num_clusters; i++) {
- // get the floats for this cluster
- const float * const floats = km_set_get_row(cs, i);
-
- for (size_t j = 0; j < set->num_rows; j++) {
- const float * const row_floats = km_set_get_row(set, j);
-
- // calculate the distance squared between these clusters
- const float d2 = distance_squared(num_floats, floats, row_floats);
-
- if (d2 < row_map[j].d2) {
- // row is closer to this cluster, update distance and cluster
- row_map[j].d2 = d2;
- row_map[j].cluster = i;
-
- // flag change
- changed = true;
- }
- }
- }
-
- if (changed) {
- // if there were changes, then we need to calculate the new
- // cluster centers
-
- // calculate new center
- for (size_t i = 0; i < num_clusters; i++) {
- size_t num_rows = 0;
- float * const floats = km_set_get_row(cs, i);
- memset(floats, 0, sizeof(float) * num_floats);
-
- for (size_t j = 0; j < set->num_rows; j++) {
- const float * const row_floats = km_set_get_row(set, j);
-
- if (row_map[j].cluster == i) {
- // calculate numerator for average
- for (size_t k = 0; k < num_floats; k++) {
- floats[k] += row_floats[k];
- }
-
- // increment denominator for average
- num_rows++;
- }
- }
-
- // save number of rows in this cluster
- cs->ints[i] = num_rows;
-
- if (num_rows > 0) {
- for (size_t k = 0; k < num_floats; k++) {
- // divide by denominator to get average
- floats[k] /= num_rows;
- }
- }
- }
- }
-
- if (cbs && cbs->on_step) {
- // pass clusters to step callback
- cbs->on_step(cs, cb_data);
- }
- } while (changed);
-
- if (cbs && cbs->on_step) {
- float means[num_clusters];
- memset(means, 0, sizeof(float) * num_clusters);
-
- // calculate mean distances
- for (size_t i = 0; i < set->num_rows; i++) {
- means[row_map[i].cluster] += row_map[i].d2;
- }
-
- // calculate mean distances
- for (size_t i = 0; i < num_clusters; i++) {
- means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0;
- }
-
- // emit means
- cbs->on_means(cs, means, num_clusters, cb_data);
- }
-
- // free row_map
- free(row_map);
-
- // return success
- return true;
-}
-
-typedef struct {
- float sum;
- size_t num_empty_clusters;
-} search_test_data_t;
-
-static void
-search_test_on_means(
- const km_set_t * const set,
- const float * const means,
- const size_t num_clusters,
- void * const cb_data
-) {
- search_test_data_t *test_data = cb_data;
- UNUSED(set);
-
- // calculate numerator for the average distance across all clusters in
- // this test
- for (size_t i = 0; i < num_clusters; i++) {
- if (fabsf(means[i]) > MIN_CLUSTER_DISTANCE) {
- test_data->sum += means[i];
- } else {
- test_data->num_empty_clusters++;
- }
- }
-}
-
-static const km_clusters_solve_cbs_t
-SEARCH_TEST_CBS = {
- .on_means = search_test_on_means,
-};
-
-bool
-km_search(
- const km_set_t * const set,
- const size_t max_clusters,
- const size_t num_tests,
- const km_search_row_cb_t on_row,
- void *cb_data
-) {
- // init random source
- km_rand_src_t rs;
- km_rand_src_system_init(&rs);
-
- for (size_t i = 2; i < max_clusters; i++) {
- for (size_t j = 0; j < num_tests; j++) {
- // init cluster set
- km_set_t cs;
- if (!km_clusters_rand_init(&cs, set->shape.num_floats, i, &rs)) {
- // return failure
- return false;
- }
-
- // init test data
- search_test_data_t data = {
- .sum = 0,
- .num_empty_clusters = 0,
- };
-
- // solve test
- if (!km_clusters_solve(&cs, set, &SEARCH_TEST_CBS, &data)) {
- // return failure
- return false;
- }
-
- if (on_row) {
- // calculate mean
- const float mean = (data.num_empty_clusters < i) ? (data.sum / (i - data.num_empty_clusters)) : 0;
-
- // init search result row
- km_search_row_t row = {
- .cluster_set = &cs,
- .num_clusters = i,
- .mean_distance = mean,
- .num_empty_clusters = data.num_empty_clusters,
- };
-
- // emit row
- on_row(&row, cb_data);
- }
-
- // free cluster set
- km_set_fini(&cs);
- }
- }
-
- // return success
- return true;
+ return km_set_push(cs, num_clusters, floats, ints);
}