diff options
Diffstat (limited to 'km-set.c')
-rw-r--r-- | km-set.c | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/km-set.c b/km-set.c new file mode 100644 index 0000000..5b9ae4c --- /dev/null +++ b/km-set.c @@ -0,0 +1,249 @@ +#include <stdbool.h> // bool +#include <stdint.h> // size_t +#include <string.h> // memcpy() +#include <stdlib.h> // rand() +#include "util.h" +#include "km.h" + +#define MIN_ROWS (4096 / sizeof(float)) + +// grow data set +static bool +km_set_grow( + km_set_t * const set, + const size_t capacity +) { + // alloc floats + const size_t num_floats = set->shape.num_floats * capacity; + float * const floats = malloc(sizeof(float) * num_floats); + if (!floats) { + // return failure + return false; + } + + // alloc ints + const size_t num_ints = set->shape.num_ints * capacity; + int * const ints = malloc(sizeof(int) * num_ints); + if (!ints) { + // return failure + return false; + } + + // update set + set->floats = floats; + set->ints = ints; + set->capacity = capacity; + + // return success + return true; +} + +// init data set with shape and initial size +bool +km_set_init( + km_set_t * const set, + const km_shape_t * const shape, + const size_t row_capacity +) { + // alloc bounds + float * const bounds = malloc(2 * sizeof(float) * shape->num_floats); + if (!bounds) { + // return error + return false; + } + + set->state = KM_SET_STATE_INIT; + set->floats = NULL; + set->ints = NULL; + set->shape = *shape; + set->num_rows = 0; + set->capacity = 0; + set->bounds = bounds; + + return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1)); +} + +// finalize data set +void +km_set_fini(km_set_t * const set) { + if (set->state == KM_SET_STATE_FINI) { + return; + } + + if (set->bounds) { + // free bounds + free(set->bounds); + set->bounds = NULL; + } + + if (set->floats) { + // free floats + free(set->floats); + set->floats = NULL; + } + + if (set->ints) { + // free ints + free(set->ints); + set->ints = NULL; + } + + // shrink capacity + set->capacity = 0; + + // set state + set->state = KM_SET_STATE_FINI; +} + +// append rows to data set, growing set if necessary +bool +km_set_push( + km_set_t * const set, + const size_t num_rows, + const float * const floats, + const int * const ints +) { + // check state + if (set->state != KM_SET_STATE_INIT) { + // return failure + return false; + } + + const size_t capacity_needed = set->num_rows + num_rows; + // FIXME: potential overflow here + if (capacity_needed >= set->capacity) { + // crappy growth algorithm + const size_t new_capacity = 2 * capacity_needed + 1; + + // resize storage + if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) { + return false; + } + } + + // copy floats + const size_t num_floats = set->shape.num_floats; + if (num_floats > 0) { + float * const dst = set->floats + num_floats * set->num_rows; + const size_t stride = sizeof(float) * num_floats; + + // copy floats + memcpy(dst, floats, stride * num_rows); + + if (!set->num_rows) { + // there were no rows, so populate bounds with first row + memcpy(set->bounds, floats, stride); + memcpy(set->bounds + num_floats, floats, stride); + } + + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < num_floats; j++) { + const float val = floats[i * num_floats + j]; + + if (val < set->bounds[j]) { + // update min bound + set->bounds[j] = val; + } + + if (val > set->bounds[num_floats + j]) { + // update max bound + set->bounds[num_floats + j] = val; + } + } + } + } + + // copy ints + const size_t num_ints = set->shape.num_ints; + if (num_ints > 0) { + int * const dst = set->ints + num_ints * set->num_rows; + const size_t stride = sizeof(int) * num_ints; + + // copy ints + memcpy(dst, ints, stride * num_rows); + } + + // increment row count + set->num_rows += num_rows; + + // return success + return true; +} + +bool +km_set_normalize( + km_set_t * const set +) { + const size_t num_floats = set->shape.num_floats; + + // check set state + if (set->state != KM_SET_STATE_INIT) { + // return failure + return false; + } + + // normalize values + for (size_t i = 0; i < set->num_rows; i++) { + for (size_t j = 0; j < num_floats; j++) { + const size_t ofs = i * num_floats + j; + const float val = set->floats[ofs], + min = set->bounds[j], + max = set->bounds[num_floats + j]; + + // normalize and write value + set->floats[ofs] = (val - min) / (max - min); + } + } + + // set state + set->state = KM_SET_STATE_NORMALIZED; + + // return success + return true; +} + +// get row from data set +float * +km_set_get_row( + const km_set_t * const set, + const size_t i +) { + const size_t num_floats = set->shape.num_floats; + return set->floats + i * num_floats; +} + +// init a set with num_clusters clusters of shape num_floats by picking +// random cluster centers +bool +km_set_init_rand_clusters( + km_set_t * const cs, + const size_t num_floats, + const size_t num_clusters, + km_rand_src_t * const rs +) { + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // generate random cluster centers + float floats[num_floats * num_clusters]; + if (!km_rand_src_fill(rs, num_floats * num_clusters, floats)) { + // return failure + return false; + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push(cs, num_clusters, floats, ints); +} |