aboutsummaryrefslogtreecommitdiff
path: root/main.c
blob: 73d565555de656864211138a1d62943b22db7634 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#include <stdbool.h> // bool
#include <stdio.h> // fprintf()
#include <unistd.h> // EXIT_{FAILURE,SUCCESS}
#include <stdlib.h> // exit()
#include <string.h> // memset()
#include <math.h>  // fabsf()

#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_ONLY_PNG
#include "stb_image_write.h"

#include "util.h"
#include "km.h"

#define MAX_CLUSTERS 10
#define NUM_TESTS 100
#define MAX_BEST 4

#define IM_WIDTH 128
#define IM_HEIGHT 128
#define IM_STRIDE (3 * IM_WIDTH)

typedef struct {
  float score;
  km_set_t set;
} best_item_t;

typedef struct {
  km_rand_t rs;

  struct {
    float distance,
          variance,
          cluster_size;
    size_t num_empty_clusters;
  } rows[MAX_CLUSTERS - 2];

  best_item_t best[MAX_BEST];
  size_t num_best;
} find_t;

static int
best_score_cmp(
  const void * const ap,
  const void * const bp
) {
  const best_item_t * const a = ap;
  const best_item_t * const b = bp;

  return (a->score > b->score) ? -1 : 1;
}

static void
find_sort_best(
  find_t * const find_data
) {
  // sort best sets by ascending score (worst to best)
  qsort(
    find_data->best,
    (find_data->num_best % MAX_BEST),
    sizeof(best_item_t),
    best_score_cmp
  );
}

static void
find_each_best(
  const find_t * const find_data,
  void (*on_best)(const km_set_t * const, const size_t, const float, void *),
  void * const cb_data
) {
  if (on_best) {
    // walk best sets and emit each one
    for (size_t i = 0; i < MIN(find_data->num_best, MAX_BEST); i++) {
      on_best(&(find_data->best[i].set), i, find_data->best[i].score, cb_data);
    }
  }
}

static bool
load_on_shape(
  const km_shape_t * const shape,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints);

  // init set
  if (!km_set_init(set, shape, 100)) {
    die("km_set_init() failed");
  }

  // return success
  return true;
}

static bool
load_on_row(
  const float * const floats,
  const int * const ints,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // push row
  if (!km_set_push(set, 1, floats, ints)) {
    die("km_set_push_rows() failed");
  }

  // return success
  return true;
}

static void
load_on_error(
  const char * const err,
  void * const cb_data
) {
  UNUSED(cb_data);
  die("load failed: %s", err);
}

static const km_load_cbs_t
LOAD_CBS = {
  .on_shape = load_on_shape,
  .on_row   = load_on_row,
  .on_error = load_on_error,
};

static bool
find_on_init(
  km_set_t * const cs,
  const size_t num_floats,
  const size_t num_clusters,
  void *cb_data
) {
  find_t *data = cb_data;
  return km_set_init_rand_clusters(cs, num_floats, num_clusters, &(data->rs));
}

static bool
find_on_fini(
  km_set_t * const cs,
  void *cb_data
) {
  UNUSED(cb_data);
  km_set_fini(cs);
  return true;
}

static void
find_on_data(
  const km_find_data_t * const data,
  void *cb_data
) {
  find_t * const find_data = cb_data;
  const size_t ofs = data->num_clusters - 2;

  find_data->rows[ofs].distance += data->mean_distance;
  find_data->rows[ofs].variance += data->mean_variance;
  find_data->rows[ofs].cluster_size += data->mean_cluster_size;
  find_data->rows[ofs].num_empty_clusters += data->num_empty_clusters;
}

static bool
find_on_best(
  const float score,
  const km_set_t * const cs,
  void *cb_data
) {
  find_t * const find_data = cb_data;

  D("best score = %0.3f, num_clusters = %zu", score, cs->num_rows);

  // get pointer to destination set
  // (note: data->best is a ring buffer)
  km_set_t *dst = &(find_data->best[find_data->num_best % MAX_BEST].set);

  if (find_data->num_best >= MAX_BEST) {
    // finalize old best data set
    // D("finalizing old best %zu", find_data->num_best);
    km_set_fini(dst);
  }

  // copy data set to best ring buffer
  if (!km_set_copy(dst, cs)) {
    die("km_set_copy()");
  }

  // increment best count
  find_data->num_best++;

  // return success
  return true;
}

// init find config
static const km_find_cbs_t
FIND_CBS = {
  .max_clusters = MAX_CLUSTERS,
  .num_tests    = NUM_TESTS,

  .on_init      = find_on_init,
  .on_fini      = find_on_fini,
  .on_data      = find_on_data,
  .on_best      = find_on_best,
};

static float
get_score(
  const size_t ofs,
  const find_t * const find_data
) {
  // const size_t num_clusters = ofs + 2;
  const float mean_distance = find_data->rows[ofs].distance / NUM_TESTS,
              mean_empty = 1.0 * find_data->rows[ofs].num_empty_clusters / NUM_TESTS;

  return 1.0 / (mean_distance + mean_empty);
}

static void
print_csv_row(
  const size_t i,
  const find_t * const find_data
) {
  const size_t num_clusters = i + 2;
  const float mean_distance = find_data->rows[i].distance / NUM_TESTS,
              mean_variance = find_data->rows[i].variance / NUM_TESTS,
              mean_cluster_size = find_data->rows[i].cluster_size / NUM_TESTS,
              mean_empty_clusters = 1.0 * find_data->rows[i].num_empty_clusters / NUM_TESTS;

  // print result
  printf("%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n",
    num_clusters,
    get_score(i, find_data),
    mean_distance,
    mean_variance,
    mean_cluster_size,
    mean_empty_clusters
  );
}

static void
print_csv(
  const find_t * const find_data
) {
  // print headers
  printf(
    "#,"
    "score,"
    "distance,"
    "variance,"
    "cluster_size,"
    "empty_clusters\n"
  );

  for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) {
    print_csv_row(i, find_data);
  }
}

static uint8_t im_data[3 * IM_WIDTH * IM_HEIGHT];

static void
save_on_best(
  const km_set_t * const set,
  const size_t rank,
  const float score,
  void * const cb_data
) {
  UNUSED(cb_data);


  // convert rank to channel brightness
  const uint8_t ch = 0x66 + (0xff - 0x66) * (1.0 * rank) / (MAX_BEST - 1);
  const uint32_t color = (ch & 0xff) << 16;
  // const uint32_t color = 0xff0000;
  D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color);

  // draw clusters
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 3, color);
}

static void
save_png(
  const char * const png_path,
  const km_set_t * const set,
  const find_t * const find_data
) {
  // clear image data to white
  memset(im_data, 0xff, sizeof(im_data));

  // draw red points
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 1, 0x000000);
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }

  // draw best cluster points
  find_each_best(find_data, save_on_best, NULL);

  // save png
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }
}

int main(int argc, char *argv[]) {
  // check command-line
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <data_path> <png_path>\n", argv[0]);
    return EXIT_FAILURE;
  }

  // init random seed
  srand(getpid());

  // init find data
  find_t find_data;
  memset(&find_data, 0, sizeof(find_t));
  km_rand_init_system(&(find_data.rs));

  // init data set
  km_set_t set;
  if (!km_load_path(argv[1], &LOAD_CBS, &set)) {
    die("km_load_path() failed");
  }

  if (!km_set_normalize(&set)) {
    die("km_set_normalize() failed");
  }

  // find best solutions
  if (!km_find(&set, &FIND_CBS, &find_data)) {
    die("km_find()");
  }

  // print csv
  print_csv(&find_data);

  // sort best results from lowest to highest
  find_sort_best(&find_data);

  if (argc > 2) {
    // save png of normalized data set and best clusters
    save_png(argv[2], &set, &find_data);
  }

  // finalize data set
  km_set_fini(&set);

  // return success
  return 0;
}