#include // bool #include // size_t #include // memcpy() #include // rand() #include // errno #include "util.h" #include "km.h" #define MIN_ROWS (4096 / sizeof(float)) // grow data set static bool km_set_grow( km_set_t * const set, const size_t capacity ) { float *floats = NULL; const size_t floats_size = sizeof(float) * set->shape.num_floats * capacity; // fprintf(stderr, "floats_size = %zu\n", floats_size); if (floats_size > 0) { // alloc floats floats = realloc(set->floats, floats_size); if (!floats) { // return failure return false; } } int *ints = NULL; const size_t ints_size = sizeof(int) * set->shape.num_ints * capacity; // fprintf(stderr, "ints_size = %zu\n", ints_size); if (ints_size > 0) { // alloc ints ints = realloc(set->ints, ints_size); if (!ints) { // return failure return false; } } // update set set->floats = floats; set->ints = ints; set->capacity = capacity; // return success return true; } // init data set with shape and initial size bool km_set_init( km_set_t * const set, const km_shape_t * const shape, const size_t row_capacity ) { // alloc bounds float * const bounds = malloc(2 * sizeof(float) * shape->num_floats); if (!bounds) { // return error return false; } set->state = KM_SET_STATE_INIT; set->floats = NULL; set->ints = NULL; set->shape = *shape; set->num_rows = 0; set->capacity = 0; set->bounds = bounds; return km_set_grow(set, MAX(MIN_ROWS, row_capacity + 1)); } // finalize data set void km_set_fini(km_set_t * const set) { if (set->state == KM_SET_STATE_FINI) { return; } if (set->bounds) { // free bounds free(set->bounds); set->bounds = NULL; } if (set->floats) { // free floats free(set->floats); set->floats = NULL; } if (set->ints) { // free ints free(set->ints); set->ints = NULL; } // shrink capacity set->capacity = 0; // set state set->state = KM_SET_STATE_FINI; } // append rows to data set, growing set if necessary bool km_set_push( km_set_t * const set, const size_t num_rows, const float * const floats, const int * const ints ) { // check state if (set->state != KM_SET_STATE_INIT) { // return failure return false; } const size_t capacity_needed = set->num_rows + num_rows; // FIXME: potential overflow here if (capacity_needed >= set->capacity) { // crappy growth algorithm const size_t new_capacity = 2 * capacity_needed + 1; // resize storage if (!km_set_grow(set, MAX(MIN_ROWS, new_capacity))) { return false; } } // copy floats const size_t num_floats = set->shape.num_floats; if (num_floats > 0) { float * const dst = set->floats + num_floats * set->num_rows; const size_t stride = sizeof(float) * num_floats; // copy floats memcpy(dst, floats, stride * num_rows); if (!set->num_rows) { // there were no rows, so populate bounds with first row memcpy(set->bounds, floats, stride); memcpy(set->bounds + num_floats, floats, stride); } for (size_t i = 0; i < num_rows; i++) { for (size_t j = 0; j < num_floats; j++) { const float val = floats[i * num_floats + j]; if (val < set->bounds[j]) { // update min bound set->bounds[j] = val; } if (val > set->bounds[num_floats + j]) { // update max bound set->bounds[num_floats + j] = val; } } } } // copy ints const size_t num_ints = set->shape.num_ints; if (num_ints > 0) { int * const dst = set->ints + num_ints * set->num_rows; const size_t stride = sizeof(int) * num_ints; // copy ints memcpy(dst, ints, stride * num_rows); } // increment row count set->num_rows += num_rows; // return success return true; } bool km_set_copy( km_set_t * const dst, const km_set_t * const src ) { if (src->state != KM_SET_STATE_INIT || src->state != KM_SET_STATE_NORMALIZED) { // return failure return false; } // init dst set if (!km_set_init(dst, &(src->shape), src->num_rows)) { // return failure return false; } // copy floats const size_t num_floats = src->shape.num_floats; if (num_floats > 0) { const size_t stride = sizeof(float) * num_floats; // copy floats memcpy(dst->floats, src->floats, stride * src->num_rows); // copy bounds memcpy(dst->bounds, src->bounds, 2 * stride); } // copy ints const size_t num_ints = src->shape.num_ints; if (num_ints > 0) { const size_t stride = sizeof(int) * num_ints; // copy ints memcpy(dst->ints, src->ints, stride * src->num_rows); } // increment row count dst->num_rows = src->num_rows; // return success return true; } bool km_set_normalize( km_set_t * const set ) { const size_t num_floats = set->shape.num_floats; // check set state if (set->state != KM_SET_STATE_INIT) { // return failure return false; } // normalize values for (size_t i = 0; i < set->num_rows; i++) { for (size_t j = 0; j < num_floats; j++) { const size_t ofs = i * num_floats + j; const float val = set->floats[ofs], min = set->bounds[j], max = set->bounds[num_floats + j]; // normalize and write value set->floats[ofs] = (val - min) / (max - min); } } // set state set->state = KM_SET_STATE_NORMALIZED; // return success return true; } // get row from data set float * km_set_get_row( const km_set_t * const set, const size_t i ) { const size_t num_floats = set->shape.num_floats; return set->floats + i * num_floats; } // init a set with num_clusters clusters of shape num_floats by picking // random cluster centers bool km_set_init_rand_clusters( km_set_t * const cs, const size_t num_floats, const size_t num_clusters, km_rand_t * const rs ) { // init cluster shape const km_shape_t shape = { .num_floats = num_floats, .num_ints = 1, }; // generate random cluster centers float floats[num_floats * num_clusters]; if (!km_rand_fill(rs, num_floats * num_clusters, floats)) { // return failure return false; } // FIXME: should probably be heap-allocated int ints[num_clusters]; memset(ints, 0, sizeof(ints)); // init cluster set if (!km_set_init(cs, &shape, num_clusters)) { // return failure return false; } // add data, return result return km_set_push(cs, num_clusters, floats, ints); }