aboutsummaryrefslogtreecommitdiff
path: root/km-set.c
diff options
context:
space:
mode:
Diffstat (limited to 'km-set.c')
-rw-r--r--km-set.c249
1 files changed, 249 insertions, 0 deletions
diff --git a/km-set.c b/km-set.c
new file mode 100644
index 0000000..5b9ae4c
--- /dev/null
+++ b/km-set.c
@@ -0,0 +1,249 @@
+#include <stdbool.h> // bool
+#include <stdint.h> // size_t
+#include <string.h> // memcpy()
+#include <stdlib.h> // rand()
+#include "util.h"
+#include "km.h"
+
+#define MIN_ROWS (4096 / sizeof(float))
+
+// grow data set
+static bool
+km_set_grow(
+ km_set_t * const set,
+ const size_t capacity
+) {
+ // alloc floats
+ const size_t num_floats = set->shape.num_floats * capacity;
+ float * const floats = malloc(sizeof(float) * num_floats);
+ if (!floats) {
+ // return failure
+ return false;
+ }
+
+ // alloc ints
+ const size_t num_ints = set->shape.num_ints * capacity;
+ int * const ints = malloc(sizeof(int) * num_ints);
+ if (!ints) {
+ // return failure
+ return false;
+ }
+
+ // update set
+ set->floats = floats;
+ set->ints = ints;
+ set->capacity = capacity;
+
+ // return success
+ return true;
+}
+
+// init data set with shape and initial size
+bool
+km_set_init(
+ km_set_t * const set,
+ const km_shape_t * const shape,
+ const size_t row_capacity
+) {
+ // alloc bounds
+ float * const bounds = malloc(2 * sizeof(float) * shape->num_floats);
+ if (!bounds) {
+ // return error
+ return false;
+ }
+
+ set->state = KM_SET_STATE_INIT;
+ set->floats = NULL;
+ set->ints = NULL;
+ set->shape = *shape;
+ set->num_rows = 0;
+ set->capacity = 0;
+ set->bounds = bounds;
+
+ return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1));
+}
+
+// finalize data set
+void
+km_set_fini(km_set_t * const set) {
+ if (set->state == KM_SET_STATE_FINI) {
+ return;
+ }
+
+ if (set->bounds) {
+ // free bounds
+ free(set->bounds);
+ set->bounds = NULL;
+ }
+
+ if (set->floats) {
+ // free floats
+ free(set->floats);
+ set->floats = NULL;
+ }
+
+ if (set->ints) {
+ // free ints
+ free(set->ints);
+ set->ints = NULL;
+ }
+
+ // shrink capacity
+ set->capacity = 0;
+
+ // set state
+ set->state = KM_SET_STATE_FINI;
+}
+
+// append rows to data set, growing set if necessary
+bool
+km_set_push(
+ km_set_t * const set,
+ const size_t num_rows,
+ const float * const floats,
+ const int * const ints
+) {
+ // check state
+ if (set->state != KM_SET_STATE_INIT) {
+ // return failure
+ return false;
+ }
+
+ const size_t capacity_needed = set->num_rows + num_rows;
+ // FIXME: potential overflow here
+ if (capacity_needed >= set->capacity) {
+ // crappy growth algorithm
+ const size_t new_capacity = 2 * capacity_needed + 1;
+
+ // resize storage
+ if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) {
+ return false;
+ }
+ }
+
+ // copy floats
+ const size_t num_floats = set->shape.num_floats;
+ if (num_floats > 0) {
+ float * const dst = set->floats + num_floats * set->num_rows;
+ const size_t stride = sizeof(float) * num_floats;
+
+ // copy floats
+ memcpy(dst, floats, stride * num_rows);
+
+ if (!set->num_rows) {
+ // there were no rows, so populate bounds with first row
+ memcpy(set->bounds, floats, stride);
+ memcpy(set->bounds + num_floats, floats, stride);
+ }
+
+ for (size_t i = 0; i < num_rows; i++) {
+ for (size_t j = 0; j < num_floats; j++) {
+ const float val = floats[i * num_floats + j];
+
+ if (val < set->bounds[j]) {
+ // update min bound
+ set->bounds[j] = val;
+ }
+
+ if (val > set->bounds[num_floats + j]) {
+ // update max bound
+ set->bounds[num_floats + j] = val;
+ }
+ }
+ }
+ }
+
+ // copy ints
+ const size_t num_ints = set->shape.num_ints;
+ if (num_ints > 0) {
+ int * const dst = set->ints + num_ints * set->num_rows;
+ const size_t stride = sizeof(int) * num_ints;
+
+ // copy ints
+ memcpy(dst, ints, stride * num_rows);
+ }
+
+ // increment row count
+ set->num_rows += num_rows;
+
+ // return success
+ return true;
+}
+
+bool
+km_set_normalize(
+ km_set_t * const set
+) {
+ const size_t num_floats = set->shape.num_floats;
+
+ // check set state
+ if (set->state != KM_SET_STATE_INIT) {
+ // return failure
+ return false;
+ }
+
+ // normalize values
+ for (size_t i = 0; i < set->num_rows; i++) {
+ for (size_t j = 0; j < num_floats; j++) {
+ const size_t ofs = i * num_floats + j;
+ const float val = set->floats[ofs],
+ min = set->bounds[j],
+ max = set->bounds[num_floats + j];
+
+ // normalize and write value
+ set->floats[ofs] = (val - min) / (max - min);
+ }
+ }
+
+ // set state
+ set->state = KM_SET_STATE_NORMALIZED;
+
+ // return success
+ return true;
+}
+
+// get row from data set
+float *
+km_set_get_row(
+ const km_set_t * const set,
+ const size_t i
+) {
+ const size_t num_floats = set->shape.num_floats;
+ return set->floats + i * num_floats;
+}
+
+// init a set with num_clusters clusters of shape num_floats by picking
+// random cluster centers
+bool
+km_set_init_rand_clusters(
+ km_set_t * const cs,
+ const size_t num_floats,
+ const size_t num_clusters,
+ km_rand_src_t * const rs
+) {
+ // init cluster shape
+ const km_shape_t shape = {
+ .num_floats = num_floats,
+ .num_ints = 1,
+ };
+
+ // generate random cluster centers
+ float floats[num_floats * num_clusters];
+ if (!km_rand_src_fill(rs, num_floats * num_clusters, floats)) {
+ // return failure
+ return false;
+ }
+
+ // FIXME: should probably be heap-allocated
+ int ints[num_clusters];
+ memset(ints, 0, sizeof(ints));
+
+ // init cluster set
+ if (!km_set_init(cs, &shape, num_clusters)) {
+ // return failure
+ return false;
+ }
+
+ // add data, return result
+ return km_set_push(cs, num_clusters, floats, ints);
+}