aboutsummaryrefslogtreecommitdiff
path: root/main.c
blob: 6acaf600480459108a5dd21c6faece7a37d42057 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#include <stdbool.h> // bool
#include <stdio.h> // fprintf()
#include <unistd.h> // EXIT_{FAILURE,SUCCESS}
#include <stdlib.h> // exit()
#include <string.h> // memset()
#include <math.h>  // fabsf()

#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_ONLY_PNG
#include "stb_image_write.h"

#include "util.h"
#include "km.h"

#define MAX_CLUSTERS 10
#define NUM_TESTS 100
#define MAX_BEST 10

#define IM_WIDTH 128
#define IM_HEIGHT 128
#define IM_STRIDE (3 * IM_WIDTH)

typedef struct {
  float score;
  km_set_t set;
} best_item_t;

typedef struct {
  // cluster initialization method
  km_init_type_t init_type;

  // random number source
  km_rand_t rs;

  struct {
    float distance,
          variance,
          cluster_size;
    size_t num_empty_clusters;
  } rows[MAX_CLUSTERS - 2];

  // best clusters
  best_item_t best[MAX_BEST];
  size_t num_best;
} ctx_t;

static int
best_score_cmp(
  const void * const ap,
  const void * const bp
) {
  const best_item_t * const a = ap;
  const best_item_t * const b = bp;

  return (a->score > b->score) ? 1 : -1;
}

static void
ctx_best_sort(
  ctx_t * const ctx
) {
  // sort best sets by ascending score (worst to best)
  qsort(
    ctx->best,
    (ctx->num_best % MAX_BEST),
    sizeof(best_item_t),
    best_score_cmp
  );
}

static void
ctx_best_each(
  const ctx_t * const ctx,
  void (*on_best)(const km_set_t * const, const size_t, const float, void *),
  void * const cb_data
) {
  if (on_best) {
    // walk best sets and emit each one
    for (size_t i = 0; i < MIN(ctx->num_best, MAX_BEST); i++) {
      on_best(&(ctx->best[i].set), i, ctx->best[i].score, cb_data);
    }
  }
}

static bool
load_on_shape(
  const km_shape_t * const shape,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints);

  // init set
  if (!km_set_init(set, shape, 100)) {
    die("km_set_init() failed");
  }

  // return success
  return true;
}

static bool
load_on_row(
  const float * const floats,
  const int * const ints,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // push row
  if (!km_set_push(set, 1, floats, ints)) {
    die("km_set_push_rows() failed");
  }

  // return success
  return true;
}

static void
load_on_error(
  const char * const err,
  void * const cb_data
) {
  UNUSED(cb_data);
  die("load failed: %s", err);
}

static const km_load_cbs_t
LOAD_CBS = {
  .on_shape = load_on_shape,
  .on_row   = load_on_row,
  .on_error = load_on_error,
};

static bool
find_on_init(
  km_set_t * const cs,
  const size_t num_clusters,
  const km_set_t * const set,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;
  return km_init(cs, ctx->init_type, num_clusters, set, &(ctx->rs));
}

static bool
find_on_fini(
  km_set_t * const cs,
  void *cb_data
) {
  UNUSED(cb_data);
  km_set_fini(cs);
  return true;
}

static void
find_on_data(
  const km_find_data_t * const data,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;
  const size_t ofs = data->num_clusters - 2;

  ctx->rows[ofs].distance += data->mean_distance;
  ctx->rows[ofs].variance += data->mean_variance;
  ctx->rows[ofs].cluster_size += data->mean_cluster_size;
  ctx->rows[ofs].num_empty_clusters += data->num_empty_clusters;
}

static bool
find_on_best(
  const float score,
  const km_set_t * const cs,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;

  D("best score = %0.3f, num_clusters = %zu", score, cs->num_rows);

  // get pointer to destination set
  // (note: data->best is a ring buffer)
  km_set_t * const dst = &(ctx->best[ctx->num_best % MAX_BEST].set);

  if (ctx->num_best >= MAX_BEST) {
    // finalize old best data set
    // D("finalizing old best %zu", ctx->num_best);
    km_set_fini(dst);
  }

  // copy data set to best ring buffer
  if (!km_set_copy(dst, cs)) {
    die("km_set_copy()");
  }

  // increment best count
  ctx->num_best++;

  // return success
  return true;
}

// init find config
static const km_find_cbs_t
FIND_CBS = {
  .max_clusters = MAX_CLUSTERS,
  .num_tests    = NUM_TESTS,

  .on_init      = find_on_init,
  .on_fini      = find_on_fini,
  .on_data      = find_on_data,
  .on_best      = find_on_best,
};

static void
print_csv_row(
  const ctx_t * const ctx,
  const size_t i
) {
  const size_t num_clusters = i + 2;
  const float mean_distance = ctx->rows[i].distance / NUM_TESTS,
              mean_variance = ctx->rows[i].variance / NUM_TESTS,
              mean_cluster_size = ctx->rows[i].cluster_size / NUM_TESTS,
              mean_empty_clusters = 1.0 * ctx->rows[i].num_empty_clusters / NUM_TESTS,
              score = km_score(mean_distance, mean_empty_clusters);

  // print result
  printf("%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n",
    num_clusters,
    score,
    mean_distance,
    mean_variance,
    mean_cluster_size,
    mean_empty_clusters
  );
}

static void
print_csv(
  const ctx_t * const ctx
) {
  // print headers
  printf(
    "#,"
    "score,"
    "distance,"
    "variance,"
    "cluster_size,"
    "empty_clusters\n"
  );

  for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) {
    print_csv_row(ctx, i);
  }
}

static uint8_t im_data[3 * IM_WIDTH * IM_HEIGHT];

static void
save_on_best(
  const km_set_t * const set,
  const size_t rank,
  const float score,
  void * const cb_data
) {
  UNUSED(score);
  UNUSED(cb_data);

  // convert rank to channel brightness
  const uint8_t ch = 0x66 + (0xff - 0x66) * (1.0 * rank) / (MAX_BEST - 1);
  const uint32_t color = (ch & 0xff) << 16;
  // const uint32_t color = 0xff0000;
  // D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color);

  // draw clusters
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 3, color);
}

static void
ctx_save_png(
  const ctx_t * const ctx,
  const char * const png_path,
  const km_set_t * const set
) {
  // clear image data to white
  memset(im_data, 0xff, sizeof(im_data));

  // draw data points
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 1, 0x000000);
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }

  // draw best cluster points
  ctx_best_each(ctx, save_on_best, NULL);

  // save png
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }
}

static const char USAGE_FORMAT[] =
  "Usage: %s [init] [data_path] <png_path>\n"
  "\n"
  "Arguments:\n"
  "* init:      Cluster init method (one of \"rand\" or \"set\").\n"
  "* data_path: Path to input data file.\n"
  "* png_path:  Path to output file (optional).\n"
  "";

int main(int argc, char *argv[]) {
  // check command-line arguments
  if (argc < 3) {
    fprintf(stderr, USAGE_FORMAT, argv[0]);
    return EXIT_FAILURE;
  }

  // get command-line arguments
  const char * const init_type_name = argv[1];
  const char * const data_path = argv[2];
  const char * const png_path = (argc > 3) ? argv[3] : NULL;

  // init random seed
  srand(getpid());

  // init context
  ctx_t ctx;
  memset(&ctx, 0, sizeof(ctx_t));
  km_rand_init_system(&(ctx.rs));
  ctx.init_type = km_init_get_type(init_type_name);

  // init data set
  km_set_t set;
  if (!km_load_path(data_path, &LOAD_CBS, &set)) {
    die("km_load_path(\"%s\") failed", data_path);
  }

  // normalize data set
  if (!km_set_normalize(&set)) {
    die("km_set_normalize() failed");
  }

  // find best solutions
  if (!km_find(&set, &FIND_CBS, &ctx)) {
    die("km_find()");
  }

  // print csv
  print_csv(&ctx);

  // sort best results from lowest to highest
  ctx_best_sort(&ctx);

  if (png_path) {
    // save png of normalized data set and best clusters
    ctx_save_png(&ctx, png_path, &set);
  }

  // finalize data set
  km_set_fini(&set);

  // return success
  return 0;
}