diff options
author | Paul Duncan <pabs@pablotron.org> | 2019-02-03 18:36:52 -0500 |
---|---|---|
committer | Paul Duncan <pabs@pablotron.org> | 2019-02-03 18:36:52 -0500 |
commit | 33a722132491ebdd31722f0cada0f81f6b082282 (patch) | |
tree | 389ae93db5d0a1ff8085ea5f38026d5e060deea1 | |
parent | aa74bd04f66217ff4d617924630b13d721578159 (diff) | |
download | kmeans-33a722132491ebdd31722f0cada0f81f6b082282.tar.bz2 kmeans-33a722132491ebdd31722f0cada0f81f6b082282.zip |
cluster init refactoring, fix best sort, add km_score()
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | km-find.c | 12 | ||||
-rw-r--r-- | km-init-forgy.c | 51 | ||||
-rw-r--r-- | km-init-rand.c | 40 | ||||
-rw-r--r-- | km-init-type.c | 32 | ||||
-rw-r--r-- | km-set.c | 83 | ||||
-rw-r--r-- | km.h | 22 | ||||
-rw-r--r-- | main.c | 154 | ||||
-rw-r--r-- | util.h | 8 |
10 files changed, 237 insertions, 169 deletions
@@ -1,2 +1,3 @@ *.o km-test +*.png @@ -1,6 +1,7 @@ APP=km-test CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2 -OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o main.o +OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o \ + km-init-type.o km-init-rand.o km-init-forgy.o main.o LIBS=-lm .PHONY=all clean @@ -5,14 +5,6 @@ #define MIN_CLUSTER_DISTANCE 0.0001 -static float -get_score( - const float mean_distance, - const size_t num_empty -) { - return 1.0 / (mean_distance + num_empty); -} - typedef struct { float distance_sum, variance_sum; @@ -108,7 +100,7 @@ km_find( for (size_t j = 0; j < cbs->num_tests; j++) { // init cluster set km_set_t cs; - if (!cbs->on_init(&cs, set->shape.num_floats, i, cb_data)) { + if (!cbs->on_init(&cs, i, set, cb_data)) { // return failure return false; } @@ -140,7 +132,7 @@ km_find( cbs->on_data(&result, cb_data); // score result - float score = get_score(result.mean_distance, result.num_empty_clusters); + float score = km_score(result.mean_distance, result.num_empty_clusters); if (score > best_score) { // emit new best result diff --git a/km-init-forgy.c b/km-init-forgy.c new file mode 100644 index 0000000..687c528 --- /dev/null +++ b/km-init-forgy.c @@ -0,0 +1,51 @@ +#include <stdbool.h> // bool +#include <string.h> // memset() +#include "util.h" +#include "km.h" + +// init a set with num_clusters clusters of shape num_floats by picking +// random initial points from the set +bool +km_init_forgy( + km_set_t * const cs, + const size_t num_clusters, + const km_set_t * const set, + km_rand_t * const rs +) { + const size_t num_floats = set->shape.num_floats, + stride = sizeof(float) * num_floats; + + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // get random row offsets + size_t rows[num_clusters]; + if (!km_rand_fill_sizes(rs, num_clusters, rows)) { + // return failure + return false; + } + + // generate random cluster centers + float floats[num_floats * num_clusters]; + for (size_t i = 0; i < num_clusters; i++) { + const size_t row_num = rows[i] % set->num_rows; + const float * const row_floats = km_set_get_row(set, row_num); + memcpy(floats + i * num_floats, row_floats, stride); + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push(cs, num_clusters, floats, ints); +} diff --git a/km-init-rand.c b/km-init-rand.c new file mode 100644 index 0000000..f9d6eb6 --- /dev/null +++ b/km-init-rand.c @@ -0,0 +1,40 @@ +#include <stdbool.h> // bool +#include <string.h> // memset() +#include "util.h" +#include "km.h" + +// init a set with num_clusters clusters of shape num_floats by picking +// random cluster centers +bool +km_init_rand( + km_set_t * const cs, + const size_t num_clusters, + const size_t num_floats, + km_rand_t * const rs +) { + // init cluster shape + const km_shape_t shape = { + .num_floats = num_floats, + .num_ints = 1, + }; + + // generate random cluster centers + float floats[num_floats * num_clusters]; + if (!km_rand_fill(rs, num_floats * num_clusters, floats)) { + // return failure + return false; + } + + // FIXME: should probably be heap-allocated + int ints[num_clusters]; + memset(ints, 0, sizeof(ints)); + + // init cluster set + if (!km_set_init(cs, &shape, num_clusters)) { + // return failure + return false; + } + + // add data, return result + return km_set_push(cs, num_clusters, floats, ints); +} diff --git a/km-init-type.c b/km-init-type.c new file mode 100644 index 0000000..27fc1ee --- /dev/null +++ b/km-init-type.c @@ -0,0 +1,32 @@ +#include <stddef.h> // size_t +#include <string.h> // strcmp() +#include "km.h" + +static const struct { + const km_init_type_t type; + const char * const name; +} TYPES[] = {{ + .name = "rand", + .type = KM_INIT_TYPE_RAND, +}, { + .name = "forgy", + .type = KM_INIT_TYPE_FORGY, +}}; + +#define NUM_TYPES (sizeof(TYPES) / sizeof(TYPES[0])) + +km_init_type_t +km_get_init_type( + const char * const s +) { + // find init method + for (size_t i = 0; i < NUM_TYPES; i++) { + if (!strcmp(s, TYPES[i].name)) { + // return type + return TYPES[i].type; + } + } + + // return failure + return KM_INIT_TYPE_LAST; +} @@ -265,86 +265,3 @@ km_set_get_row( const size_t num_floats = set->shape.num_floats; return set->floats + i * num_floats; } - -// init a set with num_clusters clusters of shape num_floats by picking -// random cluster centers -bool -km_set_init_rand_clusters( - km_set_t * const cs, - const size_t num_floats, - const size_t num_clusters, - km_rand_t * const rs -) { - // init cluster shape - const km_shape_t shape = { - .num_floats = num_floats, - .num_ints = 1, - }; - - // generate random cluster centers - float floats[num_floats * num_clusters]; - if (!km_rand_fill(rs, num_floats * num_clusters, floats)) { - // return failure - return false; - } - - // FIXME: should probably be heap-allocated - int ints[num_clusters]; - memset(ints, 0, sizeof(ints)); - - // init cluster set - if (!km_set_init(cs, &shape, num_clusters)) { - // return failure - return false; - } - - // add data, return result - return km_set_push(cs, num_clusters, floats, ints); -} - -// init a set with num_clusters clusters of shape num_floats by picking -// random initial points from the set -bool -km_set_init_rand_points( - km_set_t * const cs, - const km_set_t * const set, - const size_t num_clusters, - km_rand_t * const rs -) { - const size_t num_floats = set->shape.num_floats, - stride = sizeof(float) * num_floats; - - // init cluster shape - const km_shape_t shape = { - .num_floats = num_floats, - .num_ints = 1, - }; - - // get random row offsets - size_t rows[num_clusters]; - if (!km_rand_fill_sizes(rs, num_clusters, rows)) { - // return failure - return false; - } - - // generate random cluster centers - float floats[num_floats * num_clusters]; - for (size_t i = 0; i < num_clusters; i++) { - const size_t row_num = rows[i] % set->num_rows; - const float * const row_floats = km_set_get_row(set, row_num); - memcpy(floats + i * num_floats, row_floats, stride); - } - - // FIXME: should probably be heap-allocated - int ints[num_clusters]; - memset(ints, 0, sizeof(ints)); - - // init cluster set - if (!km_set_init(cs, &shape, num_clusters)) { - // return failure - return false; - } - - // add data, return result - return km_set_push(cs, num_clusters, floats, ints); -} @@ -5,6 +5,14 @@ #include <stdint.h> // uint8_t #include <stdio.h> // FILE +typedef enum { + KM_INIT_TYPE_RAND, + KM_INIT_TYPE_FORGY, + KM_INIT_TYPE_LAST, +} km_init_type_t; + +km_init_type_t km_get_init_type(const char * const); + // forward typedef for callbacks typedef struct km_rand_t_ km_rand_t; @@ -77,21 +85,19 @@ _Bool km_set_normalize(km_set_t * const); // get pointer to data set row float *km_set_get_row(const km_set_t *, const size_t); -// init a set with num_clusters clusters of shape num_floats by picking -// random cluster centers -_Bool km_set_init_rand_clusters( +// init cluster set by picking random centroids +_Bool km_init_rand( km_set_t *, const size_t, const size_t, km_rand_t * ); -// init a set with num_clusters clusters of random points from the -// given set -_Bool km_set_init_rand_points( +// init cluster set by picking random rows from set +_Bool km_init_forgy( km_set_t * const, - const km_set_t * const, const size_t, + const km_set_t * const, km_rand_t * ); @@ -134,8 +140,8 @@ typedef struct { typedef _Bool (*km_find_init_cb_t)( km_set_t * const cluster_set, - const size_t num_floats, const size_t num_clusters, + const km_set_t * const data_set, void *cb_data ); @@ -14,7 +14,7 @@ #define MAX_CLUSTERS 10 #define NUM_TESTS 100 -#define MAX_BEST 4 +#define MAX_BEST 10 #define IM_WIDTH 128 #define IM_HEIGHT 128 @@ -26,6 +26,10 @@ typedef struct { } best_item_t; typedef struct { + // cluster initialization method + km_init_type_t init_type; + + // random number source km_rand_t rs; struct { @@ -35,9 +39,10 @@ typedef struct { size_t num_empty_clusters; } rows[MAX_CLUSTERS - 2]; + // best clusters best_item_t best[MAX_BEST]; size_t num_best; -} find_t; +} ctx_t; static int best_score_cmp( @@ -47,32 +52,32 @@ best_score_cmp( const best_item_t * const a = ap; const best_item_t * const b = bp; - return (a->score > b->score) ? -1 : 1; + return (a->score > b->score) ? 1 : -1; } static void -find_sort_best( - find_t * const find_data +ctx_best_sort( + ctx_t * const ctx ) { // sort best sets by ascending score (worst to best) qsort( - find_data->best, - (find_data->num_best % MAX_BEST), + ctx->best, + (ctx->num_best % MAX_BEST), sizeof(best_item_t), best_score_cmp ); } static void -find_each_best( - const find_t * const find_data, +ctx_best_each( + const ctx_t * const ctx, void (*on_best)(const km_set_t * const, const size_t, const float, void *), void * const cb_data ) { if (on_best) { // walk best sets and emit each one - for (size_t i = 0; i < MIN(find_data->num_best, MAX_BEST); i++) { - on_best(&(find_data->best[i].set), i, find_data->best[i].score, cb_data); + for (size_t i = 0; i < MIN(ctx->num_best, MAX_BEST); i++) { + on_best(&(ctx->best[i].set), i, ctx->best[i].score, cb_data); } } } @@ -84,7 +89,7 @@ load_on_shape( ) { km_set_t * const set = cb_data; - D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints); + // D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints); // init set if (!km_set_init(set, shape, 100)) { @@ -131,12 +136,22 @@ LOAD_CBS = { static bool find_on_init( km_set_t * const cs, - const size_t num_floats, const size_t num_clusters, + const km_set_t * const set, void *cb_data ) { - find_t *data = cb_data; - return km_set_init_rand_clusters(cs, num_floats, num_clusters, &(data->rs)); + ctx_t * const ctx = cb_data; + km_rand_t * const rs = &(ctx->rs); + + switch(ctx->init_type) { + case KM_INIT_TYPE_RAND: + return km_init_rand(cs, num_clusters, set->shape.num_floats, rs); + case KM_INIT_TYPE_FORGY: + return km_init_forgy(cs, num_clusters, set, rs); + default: + die("unknown cluster init method"); + return false; + } } static bool @@ -154,13 +169,13 @@ find_on_data( const km_find_data_t * const data, void *cb_data ) { - find_t * const find_data = cb_data; + ctx_t * const ctx = cb_data; const size_t ofs = data->num_clusters - 2; - find_data->rows[ofs].distance += data->mean_distance; - find_data->rows[ofs].variance += data->mean_variance; - find_data->rows[ofs].cluster_size += data->mean_cluster_size; - find_data->rows[ofs].num_empty_clusters += data->num_empty_clusters; + ctx->rows[ofs].distance += data->mean_distance; + ctx->rows[ofs].variance += data->mean_variance; + ctx->rows[ofs].cluster_size += data->mean_cluster_size; + ctx->rows[ofs].num_empty_clusters += data->num_empty_clusters; } static bool @@ -169,17 +184,17 @@ find_on_best( const km_set_t * const cs, void *cb_data ) { - find_t * const find_data = cb_data; + ctx_t * const ctx = cb_data; D("best score = %0.3f, num_clusters = %zu", score, cs->num_rows); // get pointer to destination set // (note: data->best is a ring buffer) - km_set_t *dst = &(find_data->best[find_data->num_best % MAX_BEST].set); + km_set_t * const dst = &(ctx->best[ctx->num_best % MAX_BEST].set); - if (find_data->num_best >= MAX_BEST) { + if (ctx->num_best >= MAX_BEST) { // finalize old best data set - // D("finalizing old best %zu", find_data->num_best); + // D("finalizing old best %zu", ctx->num_best); km_set_fini(dst); } @@ -189,7 +204,7 @@ find_on_best( } // increment best count - find_data->num_best++; + ctx->num_best++; // return success return true; @@ -207,33 +222,22 @@ FIND_CBS = { .on_best = find_on_best, }; -static float -get_score( - const size_t ofs, - const find_t * const find_data -) { - // const size_t num_clusters = ofs + 2; - const float mean_distance = find_data->rows[ofs].distance / NUM_TESTS, - mean_empty = 1.0 * find_data->rows[ofs].num_empty_clusters / NUM_TESTS; - - return 1.0 / (mean_distance + mean_empty); -} - static void print_csv_row( - const size_t i, - const find_t * const find_data + const ctx_t * const ctx, + const size_t i ) { const size_t num_clusters = i + 2; - const float mean_distance = find_data->rows[i].distance / NUM_TESTS, - mean_variance = find_data->rows[i].variance / NUM_TESTS, - mean_cluster_size = find_data->rows[i].cluster_size / NUM_TESTS, - mean_empty_clusters = 1.0 * find_data->rows[i].num_empty_clusters / NUM_TESTS; + const float mean_distance = ctx->rows[i].distance / NUM_TESTS, + mean_variance = ctx->rows[i].variance / NUM_TESTS, + mean_cluster_size = ctx->rows[i].cluster_size / NUM_TESTS, + mean_empty_clusters = 1.0 * ctx->rows[i].num_empty_clusters / NUM_TESTS, + score = km_score(mean_distance, mean_empty_clusters); // print result printf("%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n", num_clusters, - get_score(i, find_data), + score, mean_distance, mean_variance, mean_cluster_size, @@ -243,7 +247,7 @@ print_csv_row( static void print_csv( - const find_t * const find_data + const ctx_t * const ctx ) { // print headers printf( @@ -256,7 +260,7 @@ print_csv( ); for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) { - print_csv_row(i, find_data); + print_csv_row(ctx, i); } } @@ -269,36 +273,36 @@ save_on_best( const float score, void * const cb_data ) { + UNUSED(score); UNUSED(cb_data); - // convert rank to channel brightness const uint8_t ch = 0x66 + (0xff - 0x66) * (1.0 * rank) / (MAX_BEST - 1); const uint32_t color = (ch & 0xff) << 16; // const uint32_t color = 0xff0000; - D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color); + // D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color); // draw clusters km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 3, color); } static void -save_png( +ctx_save_png( + const ctx_t * const ctx, const char * const png_path, - const km_set_t * const set, - const find_t * const find_data + const km_set_t * const set ) { // clear image data to white memset(im_data, 0xff, sizeof(im_data)); - // draw red points + // draw data points km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 1, 0x000000); if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) { die("stbi_write_png(\"%s\")", png_path); } // draw best cluster points - find_each_best(find_data, save_on_best, NULL); + ctx_best_each(ctx, save_on_best, NULL); // save png if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) { @@ -306,45 +310,61 @@ save_png( } } +static const char USAGE_FORMAT[] = + "Usage: %s [init] [data_path] <png_path>\n" + "\n" + "Arguments:\n" + "* init: Cluster init method (one of \"rand\" or \"set\").\n" + "* data_path: Path to input data file.\n" + "* png_path: Path to output file (optional).\n" + ""; + int main(int argc, char *argv[]) { - // check command-line - if (argc < 2) { - fprintf(stderr, "Usage: %s <data_path> <png_path>\n", argv[0]); + // check command-line arguments + if (argc < 3) { + fprintf(stderr, USAGE_FORMAT, argv[0]); return EXIT_FAILURE; } + // get command-line arguments + const char * const init_type_name = argv[1]; + const char * const data_path = argv[2]; + const char * const png_path = (argc > 3) ? argv[3] : NULL; + // init random seed srand(getpid()); - // init find data - find_t find_data; - memset(&find_data, 0, sizeof(find_t)); - km_rand_init_system(&(find_data.rs)); + // init context + ctx_t ctx; + memset(&ctx, 0, sizeof(ctx_t)); + km_rand_init_system(&(ctx.rs)); + ctx.init_type = km_get_init_type(init_type_name); // init data set km_set_t set; - if (!km_load_path(argv[1], &LOAD_CBS, &set)) { - die("km_load_path() failed"); + if (!km_load_path(data_path, &LOAD_CBS, &set)) { + die("km_load_path(\"%s\") failed", data_path); } + // init data set if (!km_set_normalize(&set)) { die("km_set_normalize() failed"); } // find best solutions - if (!km_find(&set, &FIND_CBS, &find_data)) { + if (!km_find(&set, &FIND_CBS, &ctx)) { die("km_find()"); } // print csv - print_csv(&find_data); + print_csv(&ctx); // sort best results from lowest to highest - find_sort_best(&find_data); + ctx_best_sort(&ctx); - if (argc > 2) { + if (png_path) { // save png of normalized data set and best clusters - save_png(argv[2], &set, &find_data); + ctx_save_png(&ctx, png_path, &set); } // finalize data set @@ -21,4 +21,12 @@ exit(EXIT_FAILURE); \ } while (0) +static inline float +km_score( + const float mean_distance, + const size_t num_empty +) { + return 1.0 / (mean_distance + num_empty); +} + #endif /* UTIL_H */ |