aboutsummaryrefslogtreecommitdiff
path: root/src/km.h
blob: 83d7be0bc9c34458ec85b6ec915fee0d489beee5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#ifndef KM_H
#define KM_H

#include <stddef.h> // size_t
#include <stdint.h> // uint8_t
#include <stdio.h> // FILE

// forward typedef for callbacks
typedef struct km_rand_t_ km_rand_t;

// random number source callbacks
typedef struct {
  _Bool (*get_floats)(km_rand_t * const, const size_t, float * const);
  _Bool (*get_sizes)(km_rand_t * const, const size_t, size_t * const);
  void (*fini)(km_rand_t * const);
} km_rand_cbs_t;

// random number source
struct km_rand_t_ {
  const km_rand_cbs_t *cbs;
  void *data;
};

// fill buffer with N random floats
_Bool km_rand_get_floats(km_rand_t * const, const size_t, float * const);

// fill buffer with N random size_ts
_Bool km_rand_get_sizes(km_rand_t * const, const size_t, size_t * const);

// finalize random source
void km_rand_fini(km_rand_t * const);

// init libc rng (use libc rand())
void km_rand_init_libc(km_rand_t *);

// init file source rng (e.g. "/dev/urandom")
_Bool km_rand_init_path(km_rand_t * const, const char *);

// init erand48 rng (use erand48())
_Bool km_rand_init_erand48(km_rand_t * const, const uint64_t);

// shape of data set
typedef struct {
  size_t num_floats,
         num_ints;
} km_shape_t;

typedef enum {
  KM_SET_STATE_INIT,
  KM_SET_STATE_NORMALIZED,
  KM_SET_STATE_FINI,
  KM_SET_STATE_LAST,
} km_set_state_t;

// data set
typedef struct {
  km_set_state_t state;

  float *floats,
        *bounds;
  int *ints;

  km_shape_t shape;

  size_t num_rows,
         capacity;
} km_set_t;

// init data set with shape and initial size
_Bool km_set_init(km_set_t * const, const km_shape_t *, const size_t);
// finalize data set
void km_set_fini(km_set_t * const);

// append rows to data set, growing set if necessary
_Bool km_set_push(km_set_t *, const size_t, const float *, const int *);

// deep copy data set
_Bool km_set_copy(km_set_t * const, const km_set_t * const);

// normalize data set
_Bool km_set_normalize(km_set_t * const);

// get pointer to data set row
float *km_set_get_row(const km_set_t *, const size_t);

// get pointer to data set row ints
int *km_set_get_row_ints(const km_set_t *, const size_t);

typedef struct {
  float d2,
        d2_near;
  size_t cluster;
} km_row_map_t;

typedef struct {
  const float sum;
  const float silhouette;
  const size_t num_clusters;
} km_solve_stats_t;

typedef struct {
  void (*on_step)(const km_set_t *, const km_row_map_t *, void *);
  void (*on_stats)(const km_set_t *, const km_solve_stats_t *, void *);
} km_solve_cbs_t;

// use k-means to iteratively update the cluster centroids until there
// are no more changes to the centroids
_Bool
km_solve(
  km_set_t * const,
  const km_set_t * const,
  km_row_map_t * const,
  const km_solve_cbs_t * const,
  void * const
);

typedef struct {
  const km_set_t * const cluster_set;

  float distance_sum,
        silhouette,
        mean_cluster_size;

  const size_t num_clusters,
               num_empty_clusters;
} km_find_data_t;

typedef _Bool (*km_find_init_cb_t)(
  km_set_t * const cluster_set,
  const size_t num_clusters,
  const km_set_t * const data_set,
  void *cb_data
);

typedef _Bool (*km_find_fini_cb_t)(
  km_set_t * const cluster_set,
  void *cb_data
);

typedef _Bool (*km_find_best_cb_t)(
  const float score,
  const km_set_t * const cluster_set,
  void *cb_data
);

typedef void (*km_find_data_cb_t)(
  const km_find_data_t * const,
  void *
);

typedef struct {
  // maximum number of clusters
  size_t max_clusters;

  // maximum number of tests per cluster
  size_t num_tests;

  km_find_init_cb_t on_init;
  km_find_fini_cb_t on_fini;
  km_find_data_cb_t on_data;
  km_find_best_cb_t on_best;
} km_find_cbs_t;

// repeatedly test different cluster sizes and report results
_Bool
km_find(
  const km_set_t * const set,
  const km_find_cbs_t * const cbs,
  void * const cb_data
);

// render set to rgb buffer
void
km_set_draw(
  const km_set_t * const set,
  uint8_t * const rgb,
  const size_t width,
  const size_t height,
  const int dot_size,
  const uint32_t color
);

// load callbacks
typedef struct {
  _Bool (*on_shape)(const km_shape_t * const, void *);
  _Bool (*on_row)(const float *, const int *, void *);
  void (*on_error)(const char * const, void *);
} km_load_cbs_t;

// load set from file handle
_Bool
km_load(
  FILE * const fh,
  const km_load_cbs_t * const cbs,
  void * const cb_data
);

// print set to file handle
_Bool
km_set_print(
  const km_set_t * const,
  FILE * const fh
);

// load set from path
_Bool km_load_path(
  const char * const path,
  const km_load_cbs_t * const cbs,
  void * const cb_data
);

typedef enum {
  // rand: init cluster set by picking random centroids
  KM_INIT_TYPE_RAND,

  // forgy: init cluster set by picking random rows from set
  KM_INIT_TYPE_FORGY,

  // kmeans++: ...
  KM_INIT_TYPE_KMEANS,

  KM_INIT_TYPE_LAST,
} km_init_type_t;

km_init_type_t km_init_get_type(const char * const);

// init cluster set using given method
_Bool km_init(
  km_set_t * const,
  const km_init_type_t,
  const size_t,
  const km_set_t * const,
  km_rand_t * const
);

#endif /* KM_H */