aboutsummaryrefslogtreecommitdiff
path: root/src/km.h
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2019-02-05 00:22:15 -0500
committerPaul Duncan <pabs@pablotron.org>2019-02-05 00:22:15 -0500
commitf557d1f49a2914c6084dd18efc783395228d8ce0 (patch)
tree51111fc899f01956cbab948b87c241478576870d /src/km.h
parentb5065ea43cb13c0b553874305b53963176c70f59 (diff)
downloadkmeans-f557d1f49a2914c6084dd18efc783395228d8ce0.tar.bz2
kmeans-f557d1f49a2914c6084dd18efc783395228d8ce0.zip
mv *.[hc] src/
Diffstat (limited to 'src/km.h')
-rw-r--r--src/km.h230
1 files changed, 230 insertions, 0 deletions
diff --git a/src/km.h b/src/km.h
new file mode 100644
index 0000000..5a34261
--- /dev/null
+++ b/src/km.h
@@ -0,0 +1,230 @@
+#ifndef KM_H
+#define KM_H
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t
+#include <stdio.h> // FILE
+
+// forward typedef for callbacks
+typedef struct km_rand_t_ km_rand_t;
+
+// random number source callbacks
+typedef struct {
+ _Bool (*get_floats)(km_rand_t * const, const size_t, float * const);
+ _Bool (*get_sizes)(km_rand_t * const, const size_t, size_t * const);
+ void (*fini)(km_rand_t * const);
+} km_rand_cbs_t;
+
+// random number source
+struct km_rand_t_ {
+ const km_rand_cbs_t *cbs;
+ void *data;
+};
+
+// fill buffer with N random floats
+_Bool km_rand_get_floats(km_rand_t * const, const size_t, float * const);
+
+// fill buffer with N random size_ts
+_Bool km_rand_get_sizes(km_rand_t * const, const size_t, size_t * const);
+
+// finalize random source
+void km_rand_fini(km_rand_t * const);
+
+// init libc rng (use libc rand())
+void km_rand_init_libc(km_rand_t *);
+
+// init file source rng (e.g. "/dev/urandom")
+_Bool km_rand_init_path(km_rand_t * const, const char *);
+
+// init erand48 rng (use erand48())
+_Bool km_rand_init_erand48(km_rand_t * const, const uint64_t);
+
+// shape of data set
+typedef struct {
+ size_t num_floats,
+ num_ints;
+} km_shape_t;
+
+typedef enum {
+ KM_SET_STATE_INIT,
+ KM_SET_STATE_NORMALIZED,
+ KM_SET_STATE_FINI,
+ KM_SET_STATE_LAST,
+} km_set_state_t;
+
+// data set
+typedef struct {
+ km_set_state_t state;
+
+ float *floats,
+ *bounds;
+ int *ints;
+
+ km_shape_t shape;
+
+ size_t num_rows,
+ capacity;
+} km_set_t;
+
+// init data set with shape and initial size
+_Bool km_set_init(km_set_t * const, const km_shape_t *, const size_t);
+// finalize data set
+void km_set_fini(km_set_t * const);
+
+// append rows to data set, growing set if necessary
+_Bool km_set_push(km_set_t *, const size_t, const float *, const int *);
+
+// deep copy data set
+_Bool km_set_copy(km_set_t * const, const km_set_t * const);
+
+// normalize data set
+_Bool km_set_normalize(km_set_t * const);
+
+// get pointer to data set row
+float *km_set_get_row(const km_set_t *, const size_t);
+
+typedef struct {
+ float d2,
+ d2_near;
+ size_t cluster;
+} km_row_map_t;
+
+typedef struct {
+ const float sum;
+
+ const float silouette;
+
+ const float *mean_dists;
+ const float *mean_nears;
+
+ const size_t num_clusters;
+} km_solve_stats_t;
+
+typedef struct {
+ void (*on_step)(const km_set_t *, const km_row_map_t *, void *);
+ void (*on_stats)(const km_set_t *, const km_solve_stats_t *, void *);
+} km_solve_cbs_t;
+
+// use k-means to iteratively update the cluster centroids until there
+// are no more changes to the centroids
+_Bool
+km_solve(
+ km_set_t * const,
+ const km_set_t * const,
+ const km_solve_cbs_t * const,
+ void * const
+);
+
+typedef struct {
+ const km_set_t * const cluster_set;
+
+ float distance_sum,
+ silouette,
+ mean_cluster_size;
+
+ const size_t num_clusters,
+ num_empty_clusters;
+} km_find_data_t;
+
+typedef _Bool (*km_find_init_cb_t)(
+ km_set_t * const cluster_set,
+ const size_t num_clusters,
+ const km_set_t * const data_set,
+ void *cb_data
+);
+
+typedef _Bool (*km_find_fini_cb_t)(
+ km_set_t * const cluster_set,
+ void *cb_data
+);
+
+typedef _Bool (*km_find_best_cb_t)(
+ const float score,
+ const km_set_t * const cluster_set,
+ void *cb_data
+);
+
+typedef void (*km_find_data_cb_t)(
+ const km_find_data_t * const,
+ void *
+);
+
+typedef struct {
+ // maximum number of clusters
+ size_t max_clusters;
+
+ // maximum number of tests per cluster
+ size_t num_tests;
+
+ km_find_init_cb_t on_init;
+ km_find_fini_cb_t on_fini;
+ km_find_data_cb_t on_data;
+ km_find_best_cb_t on_best;
+} km_find_cbs_t;
+
+// repeatedly test different cluster sizes and report results
+_Bool
+km_find(
+ const km_set_t * const set,
+ const km_find_cbs_t * const cbs,
+ void * const cb_data
+);
+
+// render set to rgb buffer
+void
+km_set_draw(
+ const km_set_t * const set,
+ uint8_t * const rgb,
+ const size_t width,
+ const size_t height,
+ const int dot_size,
+ const uint32_t color
+);
+
+// load callbacks
+typedef struct {
+ _Bool (*on_shape)(const km_shape_t * const, void *);
+ _Bool (*on_row)(const float *, const int *, void *);
+ void (*on_error)(const char * const, void *);
+} km_load_cbs_t;
+
+// load set from file handle
+_Bool
+km_load(
+ FILE * const fh,
+ const km_load_cbs_t * const cbs,
+ void * const cb_data
+);
+
+// load set from path
+_Bool km_load_path(
+ const char * const path,
+ const km_load_cbs_t * const cbs,
+ void * const cb_data
+);
+
+typedef enum {
+ // rand: init cluster set by picking random centroids
+ KM_INIT_TYPE_RAND,
+
+ // forgy: init cluster set by picking random rows from set
+ KM_INIT_TYPE_FORGY,
+
+ // kmeans++: ...
+ KM_INIT_TYPE_KMEANS,
+
+ KM_INIT_TYPE_LAST,
+} km_init_type_t;
+
+km_init_type_t km_init_get_type(const char * const);
+
+// init cluster set using given method
+_Bool km_init(
+ km_set_t * const,
+ const km_init_type_t,
+ const size_t,
+ const km_set_t * const,
+ km_rand_t * const
+);
+
+#endif /* KM_H */