aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--Makefile3
-rw-r--r--km-find.c12
-rw-r--r--km-init-forgy.c51
-rw-r--r--km-init-rand.c40
-rw-r--r--km-init-type.c32
-rw-r--r--km-set.c83
-rw-r--r--km.h22
-rw-r--r--main.c154
-rw-r--r--util.h8
10 files changed, 237 insertions, 169 deletions
diff --git a/.gitignore b/.gitignore
index 07edfc0..fa86ed6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
*.o
km-test
+*.png
diff --git a/Makefile b/Makefile
index 9415e83..f3a71d5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
APP=km-test
CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2
-OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o main.o
+OBJS=km-set.o km-draw.o km-load.o km-find.o km-rand.o km-solve.o \
+ km-init-type.o km-init-rand.o km-init-forgy.o main.o
LIBS=-lm
.PHONY=all clean
diff --git a/km-find.c b/km-find.c
index 7ed9a6a..8e7f9ac 100644
--- a/km-find.c
+++ b/km-find.c
@@ -5,14 +5,6 @@
#define MIN_CLUSTER_DISTANCE 0.0001
-static float
-get_score(
- const float mean_distance,
- const size_t num_empty
-) {
- return 1.0 / (mean_distance + num_empty);
-}
-
typedef struct {
float distance_sum,
variance_sum;
@@ -108,7 +100,7 @@ km_find(
for (size_t j = 0; j < cbs->num_tests; j++) {
// init cluster set
km_set_t cs;
- if (!cbs->on_init(&cs, set->shape.num_floats, i, cb_data)) {
+ if (!cbs->on_init(&cs, i, set, cb_data)) {
// return failure
return false;
}
@@ -140,7 +132,7 @@ km_find(
cbs->on_data(&result, cb_data);
// score result
- float score = get_score(result.mean_distance, result.num_empty_clusters);
+ float score = km_score(result.mean_distance, result.num_empty_clusters);
if (score > best_score) {
// emit new best result
diff --git a/km-init-forgy.c b/km-init-forgy.c
new file mode 100644
index 0000000..687c528
--- /dev/null
+++ b/km-init-forgy.c
@@ -0,0 +1,51 @@
+#include <stdbool.h> // bool
+#include <string.h> // memset()
+#include "util.h"
+#include "km.h"
+
+// init a set with num_clusters clusters of shape num_floats by picking
+// random initial points from the set
+bool
+km_init_forgy(
+ km_set_t * const cs,
+ const size_t num_clusters,
+ const km_set_t * const set,
+ km_rand_t * const rs
+) {
+ const size_t num_floats = set->shape.num_floats,
+ stride = sizeof(float) * num_floats;
+
+ // init cluster shape
+ const km_shape_t shape = {
+ .num_floats = num_floats,
+ .num_ints = 1,
+ };
+
+ // get random row offsets
+ size_t rows[num_clusters];
+ if (!km_rand_fill_sizes(rs, num_clusters, rows)) {
+ // return failure
+ return false;
+ }
+
+ // generate random cluster centers
+ float floats[num_floats * num_clusters];
+ for (size_t i = 0; i < num_clusters; i++) {
+ const size_t row_num = rows[i] % set->num_rows;
+ const float * const row_floats = km_set_get_row(set, row_num);
+ memcpy(floats + i * num_floats, row_floats, stride);
+ }
+
+ // FIXME: should probably be heap-allocated
+ int ints[num_clusters];
+ memset(ints, 0, sizeof(ints));
+
+ // init cluster set
+ if (!km_set_init(cs, &shape, num_clusters)) {
+ // return failure
+ return false;
+ }
+
+ // add data, return result
+ return km_set_push(cs, num_clusters, floats, ints);
+}
diff --git a/km-init-rand.c b/km-init-rand.c
new file mode 100644
index 0000000..f9d6eb6
--- /dev/null
+++ b/km-init-rand.c
@@ -0,0 +1,40 @@
+#include <stdbool.h> // bool
+#include <string.h> // memset()
+#include "util.h"
+#include "km.h"
+
+// init a set with num_clusters clusters of shape num_floats by picking
+// random cluster centers
+bool
+km_init_rand(
+ km_set_t * const cs,
+ const size_t num_clusters,
+ const size_t num_floats,
+ km_rand_t * const rs
+) {
+ // init cluster shape
+ const km_shape_t shape = {
+ .num_floats = num_floats,
+ .num_ints = 1,
+ };
+
+ // generate random cluster centers
+ float floats[num_floats * num_clusters];
+ if (!km_rand_fill(rs, num_floats * num_clusters, floats)) {
+ // return failure
+ return false;
+ }
+
+ // FIXME: should probably be heap-allocated
+ int ints[num_clusters];
+ memset(ints, 0, sizeof(ints));
+
+ // init cluster set
+ if (!km_set_init(cs, &shape, num_clusters)) {
+ // return failure
+ return false;
+ }
+
+ // add data, return result
+ return km_set_push(cs, num_clusters, floats, ints);
+}
diff --git a/km-init-type.c b/km-init-type.c
new file mode 100644
index 0000000..27fc1ee
--- /dev/null
+++ b/km-init-type.c
@@ -0,0 +1,32 @@
+#include <stddef.h> // size_t
+#include <string.h> // strcmp()
+#include "km.h"
+
+static const struct {
+ const km_init_type_t type;
+ const char * const name;
+} TYPES[] = {{
+ .name = "rand",
+ .type = KM_INIT_TYPE_RAND,
+}, {
+ .name = "forgy",
+ .type = KM_INIT_TYPE_FORGY,
+}};
+
+#define NUM_TYPES (sizeof(TYPES) / sizeof(TYPES[0]))
+
+km_init_type_t
+km_get_init_type(
+ const char * const s
+) {
+ // find init method
+ for (size_t i = 0; i < NUM_TYPES; i++) {
+ if (!strcmp(s, TYPES[i].name)) {
+ // return type
+ return TYPES[i].type;
+ }
+ }
+
+ // return failure
+ return KM_INIT_TYPE_LAST;
+}
diff --git a/km-set.c b/km-set.c
index 535b46b..69ae6c0 100644
--- a/km-set.c
+++ b/km-set.c
@@ -265,86 +265,3 @@ km_set_get_row(
const size_t num_floats = set->shape.num_floats;
return set->floats + i * num_floats;
}
-
-// init a set with num_clusters clusters of shape num_floats by picking
-// random cluster centers
-bool
-km_set_init_rand_clusters(
- km_set_t * const cs,
- const size_t num_floats,
- const size_t num_clusters,
- km_rand_t * const rs
-) {
- // init cluster shape
- const km_shape_t shape = {
- .num_floats = num_floats,
- .num_ints = 1,
- };
-
- // generate random cluster centers
- float floats[num_floats * num_clusters];
- if (!km_rand_fill(rs, num_floats * num_clusters, floats)) {
- // return failure
- return false;
- }
-
- // FIXME: should probably be heap-allocated
- int ints[num_clusters];
- memset(ints, 0, sizeof(ints));
-
- // init cluster set
- if (!km_set_init(cs, &shape, num_clusters)) {
- // return failure
- return false;
- }
-
- // add data, return result
- return km_set_push(cs, num_clusters, floats, ints);
-}
-
-// init a set with num_clusters clusters of shape num_floats by picking
-// random initial points from the set
-bool
-km_set_init_rand_points(
- km_set_t * const cs,
- const km_set_t * const set,
- const size_t num_clusters,
- km_rand_t * const rs
-) {
- const size_t num_floats = set->shape.num_floats,
- stride = sizeof(float) * num_floats;
-
- // init cluster shape
- const km_shape_t shape = {
- .num_floats = num_floats,
- .num_ints = 1,
- };
-
- // get random row offsets
- size_t rows[num_clusters];
- if (!km_rand_fill_sizes(rs, num_clusters, rows)) {
- // return failure
- return false;
- }
-
- // generate random cluster centers
- float floats[num_floats * num_clusters];
- for (size_t i = 0; i < num_clusters; i++) {
- const size_t row_num = rows[i] % set->num_rows;
- const float * const row_floats = km_set_get_row(set, row_num);
- memcpy(floats + i * num_floats, row_floats, stride);
- }
-
- // FIXME: should probably be heap-allocated
- int ints[num_clusters];
- memset(ints, 0, sizeof(ints));
-
- // init cluster set
- if (!km_set_init(cs, &shape, num_clusters)) {
- // return failure
- return false;
- }
-
- // add data, return result
- return km_set_push(cs, num_clusters, floats, ints);
-}
diff --git a/km.h b/km.h
index c70e13e..6f09009 100644
--- a/km.h
+++ b/km.h
@@ -5,6 +5,14 @@
#include <stdint.h> // uint8_t
#include <stdio.h> // FILE
+typedef enum {
+ KM_INIT_TYPE_RAND,
+ KM_INIT_TYPE_FORGY,
+ KM_INIT_TYPE_LAST,
+} km_init_type_t;
+
+km_init_type_t km_get_init_type(const char * const);
+
// forward typedef for callbacks
typedef struct km_rand_t_ km_rand_t;
@@ -77,21 +85,19 @@ _Bool km_set_normalize(km_set_t * const);
// get pointer to data set row
float *km_set_get_row(const km_set_t *, const size_t);
-// init a set with num_clusters clusters of shape num_floats by picking
-// random cluster centers
-_Bool km_set_init_rand_clusters(
+// init cluster set by picking random centroids
+_Bool km_init_rand(
km_set_t *,
const size_t,
const size_t,
km_rand_t *
);
-// init a set with num_clusters clusters of random points from the
-// given set
-_Bool km_set_init_rand_points(
+// init cluster set by picking random rows from set
+_Bool km_init_forgy(
km_set_t * const,
- const km_set_t * const,
const size_t,
+ const km_set_t * const,
km_rand_t *
);
@@ -134,8 +140,8 @@ typedef struct {
typedef _Bool (*km_find_init_cb_t)(
km_set_t * const cluster_set,
- const size_t num_floats,
const size_t num_clusters,
+ const km_set_t * const data_set,
void *cb_data
);
diff --git a/main.c b/main.c
index 73d5655..f071804 100644
--- a/main.c
+++ b/main.c
@@ -14,7 +14,7 @@
#define MAX_CLUSTERS 10
#define NUM_TESTS 100
-#define MAX_BEST 4
+#define MAX_BEST 10
#define IM_WIDTH 128
#define IM_HEIGHT 128
@@ -26,6 +26,10 @@ typedef struct {
} best_item_t;
typedef struct {
+ // cluster initialization method
+ km_init_type_t init_type;
+
+ // random number source
km_rand_t rs;
struct {
@@ -35,9 +39,10 @@ typedef struct {
size_t num_empty_clusters;
} rows[MAX_CLUSTERS - 2];
+ // best clusters
best_item_t best[MAX_BEST];
size_t num_best;
-} find_t;
+} ctx_t;
static int
best_score_cmp(
@@ -47,32 +52,32 @@ best_score_cmp(
const best_item_t * const a = ap;
const best_item_t * const b = bp;
- return (a->score > b->score) ? -1 : 1;
+ return (a->score > b->score) ? 1 : -1;
}
static void
-find_sort_best(
- find_t * const find_data
+ctx_best_sort(
+ ctx_t * const ctx
) {
// sort best sets by ascending score (worst to best)
qsort(
- find_data->best,
- (find_data->num_best % MAX_BEST),
+ ctx->best,
+ (ctx->num_best % MAX_BEST),
sizeof(best_item_t),
best_score_cmp
);
}
static void
-find_each_best(
- const find_t * const find_data,
+ctx_best_each(
+ const ctx_t * const ctx,
void (*on_best)(const km_set_t * const, const size_t, const float, void *),
void * const cb_data
) {
if (on_best) {
// walk best sets and emit each one
- for (size_t i = 0; i < MIN(find_data->num_best, MAX_BEST); i++) {
- on_best(&(find_data->best[i].set), i, find_data->best[i].score, cb_data);
+ for (size_t i = 0; i < MIN(ctx->num_best, MAX_BEST); i++) {
+ on_best(&(ctx->best[i].set), i, ctx->best[i].score, cb_data);
}
}
}
@@ -84,7 +89,7 @@ load_on_shape(
) {
km_set_t * const set = cb_data;
- D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints);
+ // D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints);
// init set
if (!km_set_init(set, shape, 100)) {
@@ -131,12 +136,22 @@ LOAD_CBS = {
static bool
find_on_init(
km_set_t * const cs,
- const size_t num_floats,
const size_t num_clusters,
+ const km_set_t * const set,
void *cb_data
) {
- find_t *data = cb_data;
- return km_set_init_rand_clusters(cs, num_floats, num_clusters, &(data->rs));
+ ctx_t * const ctx = cb_data;
+ km_rand_t * const rs = &(ctx->rs);
+
+ switch(ctx->init_type) {
+ case KM_INIT_TYPE_RAND:
+ return km_init_rand(cs, num_clusters, set->shape.num_floats, rs);
+ case KM_INIT_TYPE_FORGY:
+ return km_init_forgy(cs, num_clusters, set, rs);
+ default:
+ die("unknown cluster init method");
+ return false;
+ }
}
static bool
@@ -154,13 +169,13 @@ find_on_data(
const km_find_data_t * const data,
void *cb_data
) {
- find_t * const find_data = cb_data;
+ ctx_t * const ctx = cb_data;
const size_t ofs = data->num_clusters - 2;
- find_data->rows[ofs].distance += data->mean_distance;
- find_data->rows[ofs].variance += data->mean_variance;
- find_data->rows[ofs].cluster_size += data->mean_cluster_size;
- find_data->rows[ofs].num_empty_clusters += data->num_empty_clusters;
+ ctx->rows[ofs].distance += data->mean_distance;
+ ctx->rows[ofs].variance += data->mean_variance;
+ ctx->rows[ofs].cluster_size += data->mean_cluster_size;
+ ctx->rows[ofs].num_empty_clusters += data->num_empty_clusters;
}
static bool
@@ -169,17 +184,17 @@ find_on_best(
const km_set_t * const cs,
void *cb_data
) {
- find_t * const find_data = cb_data;
+ ctx_t * const ctx = cb_data;
D("best score = %0.3f, num_clusters = %zu", score, cs->num_rows);
// get pointer to destination set
// (note: data->best is a ring buffer)
- km_set_t *dst = &(find_data->best[find_data->num_best % MAX_BEST].set);
+ km_set_t * const dst = &(ctx->best[ctx->num_best % MAX_BEST].set);
- if (find_data->num_best >= MAX_BEST) {
+ if (ctx->num_best >= MAX_BEST) {
// finalize old best data set
- // D("finalizing old best %zu", find_data->num_best);
+ // D("finalizing old best %zu", ctx->num_best);
km_set_fini(dst);
}
@@ -189,7 +204,7 @@ find_on_best(
}
// increment best count
- find_data->num_best++;
+ ctx->num_best++;
// return success
return true;
@@ -207,33 +222,22 @@ FIND_CBS = {
.on_best = find_on_best,
};
-static float
-get_score(
- const size_t ofs,
- const find_t * const find_data
-) {
- // const size_t num_clusters = ofs + 2;
- const float mean_distance = find_data->rows[ofs].distance / NUM_TESTS,
- mean_empty = 1.0 * find_data->rows[ofs].num_empty_clusters / NUM_TESTS;
-
- return 1.0 / (mean_distance + mean_empty);
-}
-
static void
print_csv_row(
- const size_t i,
- const find_t * const find_data
+ const ctx_t * const ctx,
+ const size_t i
) {
const size_t num_clusters = i + 2;
- const float mean_distance = find_data->rows[i].distance / NUM_TESTS,
- mean_variance = find_data->rows[i].variance / NUM_TESTS,
- mean_cluster_size = find_data->rows[i].cluster_size / NUM_TESTS,
- mean_empty_clusters = 1.0 * find_data->rows[i].num_empty_clusters / NUM_TESTS;
+ const float mean_distance = ctx->rows[i].distance / NUM_TESTS,
+ mean_variance = ctx->rows[i].variance / NUM_TESTS,
+ mean_cluster_size = ctx->rows[i].cluster_size / NUM_TESTS,
+ mean_empty_clusters = 1.0 * ctx->rows[i].num_empty_clusters / NUM_TESTS,
+ score = km_score(mean_distance, mean_empty_clusters);
// print result
printf("%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n",
num_clusters,
- get_score(i, find_data),
+ score,
mean_distance,
mean_variance,
mean_cluster_size,
@@ -243,7 +247,7 @@ print_csv_row(
static void
print_csv(
- const find_t * const find_data
+ const ctx_t * const ctx
) {
// print headers
printf(
@@ -256,7 +260,7 @@ print_csv(
);
for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) {
- print_csv_row(i, find_data);
+ print_csv_row(ctx, i);
}
}
@@ -269,36 +273,36 @@ save_on_best(
const float score,
void * const cb_data
) {
+ UNUSED(score);
UNUSED(cb_data);
-
// convert rank to channel brightness
const uint8_t ch = 0x66 + (0xff - 0x66) * (1.0 * rank) / (MAX_BEST - 1);
const uint32_t color = (ch & 0xff) << 16;
// const uint32_t color = 0xff0000;
- D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color);
+ // D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color);
// draw clusters
km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 3, color);
}
static void
-save_png(
+ctx_save_png(
+ const ctx_t * const ctx,
const char * const png_path,
- const km_set_t * const set,
- const find_t * const find_data
+ const km_set_t * const set
) {
// clear image data to white
memset(im_data, 0xff, sizeof(im_data));
- // draw red points
+ // draw data points
km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 1, 0x000000);
if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
die("stbi_write_png(\"%s\")", png_path);
}
// draw best cluster points
- find_each_best(find_data, save_on_best, NULL);
+ ctx_best_each(ctx, save_on_best, NULL);
// save png
if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
@@ -306,45 +310,61 @@ save_png(
}
}
+static const char USAGE_FORMAT[] =
+ "Usage: %s [init] [data_path] <png_path>\n"
+ "\n"
+ "Arguments:\n"
+ "* init: Cluster init method (one of \"rand\" or \"set\").\n"
+ "* data_path: Path to input data file.\n"
+ "* png_path: Path to output file (optional).\n"
+ "";
+
int main(int argc, char *argv[]) {
- // check command-line
- if (argc < 2) {
- fprintf(stderr, "Usage: %s <data_path> <png_path>\n", argv[0]);
+ // check command-line arguments
+ if (argc < 3) {
+ fprintf(stderr, USAGE_FORMAT, argv[0]);
return EXIT_FAILURE;
}
+ // get command-line arguments
+ const char * const init_type_name = argv[1];
+ const char * const data_path = argv[2];
+ const char * const png_path = (argc > 3) ? argv[3] : NULL;
+
// init random seed
srand(getpid());
- // init find data
- find_t find_data;
- memset(&find_data, 0, sizeof(find_t));
- km_rand_init_system(&(find_data.rs));
+ // init context
+ ctx_t ctx;
+ memset(&ctx, 0, sizeof(ctx_t));
+ km_rand_init_system(&(ctx.rs));
+ ctx.init_type = km_get_init_type(init_type_name);
// init data set
km_set_t set;
- if (!km_load_path(argv[1], &LOAD_CBS, &set)) {
- die("km_load_path() failed");
+ if (!km_load_path(data_path, &LOAD_CBS, &set)) {
+ die("km_load_path(\"%s\") failed", data_path);
}
+ // init data set
if (!km_set_normalize(&set)) {
die("km_set_normalize() failed");
}
// find best solutions
- if (!km_find(&set, &FIND_CBS, &find_data)) {
+ if (!km_find(&set, &FIND_CBS, &ctx)) {
die("km_find()");
}
// print csv
- print_csv(&find_data);
+ print_csv(&ctx);
// sort best results from lowest to highest
- find_sort_best(&find_data);
+ ctx_best_sort(&ctx);
- if (argc > 2) {
+ if (png_path) {
// save png of normalized data set and best clusters
- save_png(argv[2], &set, &find_data);
+ ctx_save_png(&ctx, png_path, &set);
}
// finalize data set
diff --git a/util.h b/util.h
index cef238c..fa54ce0 100644
--- a/util.h
+++ b/util.h
@@ -21,4 +21,12 @@
exit(EXIT_FAILURE); \
} while (0)
+static inline float
+km_score(
+ const float mean_distance,
+ const size_t num_empty
+) {
+ return 1.0 / (mean_distance + num_empty);
+}
+
#endif /* UTIL_H */