aboutsummaryrefslogtreecommitdiff
path: root/src/main.c
blob: 4c63d1678f1cb87b5bbef3f322a7e346fc949048 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#include <stdbool.h> // bool
#include <stdio.h> // fprintf()
#include <unistd.h> // EXIT_{FAILURE,SUCCESS}
#include <stdlib.h> // exit()
#include <string.h> // memset()
#include <math.h>  // fabsf()

#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_ONLY_PNG
#include "stb_image_write.h"

#include "util.h"
#include "km.h"

// maximum number of clusters to test
#define MAX_CLUSTERS 10

// number of tests to run on each cluster size
#define NUM_TESTS 100
#define MAX_BEST 10

// output image size
#define IM_WIDTH 128
#define IM_HEIGHT 128
#define IM_STRIDE (3 * IM_WIDTH)

// static output image data buffer
static uint8_t im_data[3 * IM_WIDTH * IM_HEIGHT];

typedef struct {
  float score;
  km_set_t set;
} best_item_t;

typedef struct {
  // cluster initialization method
  km_init_type_t init_type;

  // random number source
  km_rand_t rs;

  struct {
    float distance,
          silhouette,
          cluster_size;
    size_t num_empty;
  } rows[MAX_CLUSTERS - 2];

  // best clusters
  best_item_t best[MAX_BEST];
  size_t num_best;
} ctx_t;

static inline size_t
ctx_get_max_best(
  const ctx_t * const ctx
) {
  return MIN(ctx->num_best, MAX_BEST);
}

static int
best_score_cmp(
  const void * const ap,
  const void * const bp
) {
  const best_item_t * const a = ap;
  const best_item_t * const b = bp;

  return (a->score > b->score) ? 1 : -1;
}

static void
ctx_best_sort(
  ctx_t * const ctx
) {
  // sort best sets by ascending score (worst to best)
  qsort(
    ctx->best,
    ctx_get_max_best(ctx),
    sizeof(best_item_t),
    best_score_cmp
  );
}

static bool
load_on_shape(
  const km_shape_t * const shape,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // D("shape: %zu floats, %zu ints", shape->num_floats, shape->num_ints);

  // init set
  if (!km_set_init(set, shape, 1000)) {
    die("km_set_init() failed");
  }

  // return success
  return true;
}

static bool
load_on_row(
  const float * const floats,
  const int * const ints,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // push row
  if (!km_set_push(set, 1, floats, ints)) {
    die("km_set_push_rows() failed");
  }

  // return success
  return true;
}

static void
load_on_error(
  const char * const err,
  void * const cb_data
) {
  UNUSED(cb_data);
  die("load failed: %s", err);
}

static const km_load_cbs_t
LOAD_CBS = {
  .on_shape = load_on_shape,
  .on_row   = load_on_row,
  .on_error = load_on_error,
};

static bool
find_on_init(
  km_set_t * const cs,
  const size_t num_clusters,
  const km_set_t * const set,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;
  return km_init(cs, ctx->init_type, num_clusters, set, &(ctx->rs));
}

static bool
find_on_fini(
  km_set_t * const cs,
  void *cb_data
) {
  UNUSED(cb_data);
  km_set_fini(cs);
  return true;
}

static void
find_on_data(
  const km_find_data_t * const data,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;
  const size_t ofs = data->num_clusters - 2;

  ctx->rows[ofs].distance += data->distance_sum;
  ctx->rows[ofs].silhouette += data->silhouette;
  ctx->rows[ofs].cluster_size += data->mean_cluster_size;
  ctx->rows[ofs].num_empty += data->num_empty_clusters;
}

static bool
find_on_best(
  const float score,
  const km_set_t * const cs,
  void *cb_data
) {
  ctx_t * const ctx = cb_data;

  D("new best: score = %0.3f, num_clusters = %zu", score, cs->num_rows);

  // get pointer to destination set
  // (note: data->best is a ring buffer)
  const size_t ofs = ctx->num_best % MAX_BEST;
  ctx->best[ofs].score = score;
  km_set_t * const dst = &(ctx->best[ofs].set);

  if (ctx->num_best >= MAX_BEST) {
    // finalize old best data set
    km_set_fini(dst);
  }

  // copy data set to best ring buffer
  if (!km_set_copy(dst, cs)) {
    die("km_set_copy()");
  }

  // increment best count
  ctx->num_best++;

  // return success
  return true;
}

// init find config
static const km_find_cbs_t
FIND_CBS = {
  .max_clusters = MAX_CLUSTERS,
  .num_tests    = NUM_TESTS,

  .on_init      = find_on_init,
  .on_fini      = find_on_fini,
  .on_data      = find_on_data,
  .on_best      = find_on_best,
};

static void
ctx_save_png(
  const ctx_t * const ctx,
  const char * const png_path,
  const km_set_t * const set
) {
  // clear image data to white
  memset(im_data, 0xff, sizeof(im_data));

  // draw data points
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 1, 0x000000);
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }

  // draw best centroids in green
  const km_set_t * const best_set = &(ctx->best[ctx_get_max_best(ctx) - 1].set);
  km_set_draw(best_set, im_data, IM_WIDTH, IM_HEIGHT, 3, 0x00ff00);

  // save png
  if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {
    die("stbi_write_png(\"%s\")", png_path);
  }
}

static void
ctx_best_print(
  const ctx_t * const ctx,
  const float * const bounds,
  FILE * const fh
) {
  if (!km_set_print(&(ctx->best[ctx_get_max_best(ctx) - 1].set), bounds, fh)) {
    die("km_set_print()");
  }
}

static const char USAGE_FORMAT[] =
  "Usage: %s [init] [data_path] <png_path>\n"
  "\n"
  "Arguments:\n"
  "* init:      Cluster init method (one of \"rand\" or \"set\").\n"
  "* data_path: Path to input data file.\n"
  "* png_path:  Path to output PNG (optional).\n"
  "";

int main(int argc, char *argv[]) {
  // check command-line arguments
  if (argc < 3) {
    fprintf(stderr, USAGE_FORMAT, argv[0]);
    return EXIT_FAILURE;
  }

  // get command-line arguments
  const char * const init_type_name = argv[1];
  const char * const data_path = argv[2];
  const char * const png_path = (argc > 3) ? argv[3] : NULL;

  // init rng
  srand(getpid());
  const int rand_seed = rand();

  // init context
  ctx_t ctx;
  memset(&ctx, 0, sizeof(ctx_t));
  ctx.init_type = km_init_get_type(init_type_name);

  // init ctx rng
  km_rand_init_erand48(&(ctx.rs), rand_seed);

  // init data set
  km_set_t set;
  if (!km_load_path(data_path, &LOAD_CBS, &set)) {
    die("km_load_path(\"%s\") failed", data_path);
  }

  // normalize data set
  if (!km_set_normalize(&set)) {
    die("km_set_normalize() failed");
  }

  // find best solutions
  if (!km_find(&set, &FIND_CBS, &ctx)) {
    die("km_find()");
  }

  // sort best results from lowest to highest
  ctx_best_sort(&ctx);

  // print best
  ctx_best_print(&ctx, set.bounds, stdout);

  if (png_path) {
    // save png of normalized data set and best clusters
    ctx_save_png(&ctx, png_path, &set);
  }

  // finalize data set
  km_set_fini(&set);

  // return success
  return 0;
}