aboutsummaryrefslogtreecommitdiff
path: root/main.c
blob: ec62022b49345a4c48e020073b0955b9bfdec07b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#include <stdbool.h> // bool
#include <stdio.h> // fprintf()
#include <unistd.h> // getpid()
#include <stdlib.h> // EXIT_{FAILURE,SUCCESS}, srand()
#include <string.h> // memset()
#include <math.h>  // fabsf()

#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_ONLY_PNG
#include "stb_image_write.h"

#include "util.h"
#include "km.h"

// Passed to km_find() as FIND_CBS.max_clusters; also sizes the result
// table in find_t, which has MAX_CLUSTERS - 2 rows.
#define MAX_CLUSTERS 10
// Passed to km_find() as FIND_CBS.num_tests; also the divisor used to turn
// the accumulated totals in find_t into the averages that get printed.
#define NUM_TESTS 100

/**
 * State shared by the find_on_*() callbacks and the CSV printer.
 */
typedef struct {
  // random source handed to km_set_init_rand_clusters() by find_on_init()
  km_rand_t rs;

  // running totals, one row per cluster count; find_on_data() adds to
  // the row at index (num_clusters - 2)
  struct {
    float distance;
    float variance;
    float cluster_size;
    size_t num_empty_clusters;
  } rows[MAX_CLUSTERS - 2];
} find_t;

/**
 * Loader shape callback (LOAD_CBS.on_shape).
 *
 * Logs the reported shape to stderr and initializes the km_set_t passed
 * as cb_data with that shape.  Calls die() if km_set_init() fails.
 *
 * Returns true.
 */
static bool
load_on_shape(
  const km_shape_t * const shape,
  void * const cb_data
) {
  km_set_t * const dst = cb_data;

  fprintf(stderr, "DEBUG: shape = { %zu, %zu }\n", shape->num_floats, shape->num_ints);

  // NOTE(review): the 100 is presumably an initial row capacity -- confirm
  // against km_set_init()
  if (!km_set_init(dst, shape, 100)) {
    die("km_set_init() failed");
  }

  return true;
}

/**
 * Loader row callback (LOAD_CBS.on_row).
 *
 * Appends a single row, given by `floats` and `ints`, to the km_set_t
 * passed as cb_data.  Calls die() if km_set_push() fails.
 *
 * Returns true.
 */
static bool
load_on_row(
  const float * const floats,
  const int * const ints,
  void * const cb_data
) {
  km_set_t * const set = cb_data;

  // push one row; the message names the function that is actually called
  if (!km_set_push(set, 1, floats, ints)) {
    die("km_set_push() failed");
  }

  // return success
  return true;
}

/**
 * Loader error callback (LOAD_CBS.on_error).
 *
 * Reports the loader's error message through die().  cb_data is not used.
 */
static void
load_on_error(
  const char * const err,
  void * const cb_data
) {
  // callback data is not needed to report the error
  UNUSED(cb_data);

  die("load failed: %s", err);
}

static const km_load_cbs_t
LOAD_CBS = {
  .on_shape = load_on_shape,
  .on_row   = load_on_row,
  .on_error = load_on_error,
};

/**
 * Find init callback (FIND_CBS.on_init).
 *
 * Initializes `cs` via km_set_init_rand_clusters(), using the random
 * source stored in the find_t passed as cb_data.
 *
 * Returns the result of km_set_init_rand_clusters().
 */
static bool
find_on_init(
  km_set_t * const cs,
  const size_t num_floats,
  const size_t num_clusters,
  void *cb_data
) {
  find_t * const find_data = cb_data;
  km_rand_t * const rs = &(find_data->rs);

  return km_set_init_rand_clusters(cs, num_floats, num_clusters, rs);
}

/**
 * Find finalize callback (FIND_CBS.on_fini).
 *
 * Finalizes `cs` with km_set_fini().  cb_data is not used.
 *
 * Returns true.
 */
static bool
find_on_fini(
  km_set_t * const cs,
  void *cb_data
) {
  // callback data is not needed to finalize the set
  UNUSED(cb_data);

  km_set_fini(cs);

  return true;
}

/**
 * Find data callback (FIND_CBS.on_data).
 *
 * Adds the mean distance, mean variance, mean cluster size, and empty
 * cluster count from `data` to the running totals in the find_t passed
 * as cb_data.  The row that is updated is (data->num_clusters - 2).
 *
 * A result whose cluster count does not map to a row of the table is
 * ignored instead of being written outside of the rows array.
 */
static void
find_on_data(
  const km_find_data_t * const data,
  void *cb_data
) {
  find_t * const find_data = cb_data;
  const size_t num_rows = sizeof(find_data->rows) / sizeof(find_data->rows[0]);
  const size_t ofs = data->num_clusters - 2;

  // a cluster count below 2 yields a huge size_t offset, so this single
  // check rejects both too-small and too-large cluster counts
  if (ofs >= num_rows) {
    return;
  }

  find_data->rows[ofs].distance += data->mean_distance;
  find_data->rows[ofs].variance += data->mean_variance;
  find_data->rows[ofs].cluster_size += data->mean_cluster_size;
  find_data->rows[ofs].num_empty_clusters += data->num_empty_clusters;
}

// Configuration and callbacks passed to km_find() in main(); the callback
// data is a find_t.
static const km_find_cbs_t FIND_CBS = {
  .max_clusters = MAX_CLUSTERS,
  .num_tests    = NUM_TESTS,
  .on_init      = find_on_init, // creates a random cluster set
  .on_fini      = find_on_fini, // finalizes a cluster set
  .on_data      = find_on_data, // accumulates one result into find_t
};

/**
 * Compute the score printed in the CSV output for one row of totals.
 *
 * The rows at offset 0 and offset MAX_CLUSTERS - 3 always score zero.
 * Every other row scores the reciprocal of the sum of its accumulated
 * distance divided by NUM_TESTS and its accumulated empty cluster count
 * divided by NUM_TESTS.
 */
static float
get_score(
  const size_t ofs,
  const find_t * const find_data
) {
  const bool is_edge_row = (ofs == 0) || (ofs == MAX_CLUSTERS - 3);
  if (is_edge_row) {
    return 0;
  }

  const float avg_distance = find_data->rows[ofs].distance / NUM_TESTS;
  const float avg_empty = 1.0 * find_data->rows[ofs].num_empty_clusters / NUM_TESTS;

  return 1.0 / (avg_distance + avg_empty);
}

/**
 * Print one CSV line to stdout for row `i` of the totals.
 *
 * Columns are: cluster count (i + 2), score from get_score(), and the
 * accumulated distance, variance, cluster size, and empty cluster count,
 * each divided by NUM_TESTS.
 */
static void
print_csv_row(
  const size_t i,
  const find_t * const find_data
) {
  // turn the accumulated totals into per-test averages
  const float avg_distance = find_data->rows[i].distance / NUM_TESTS;
  const float avg_variance = find_data->rows[i].variance / NUM_TESTS;
  const float avg_cluster_size = find_data->rows[i].cluster_size / NUM_TESTS;
  const float avg_empty_clusters = 1.0 * find_data->rows[i].num_empty_clusters / NUM_TESTS;

  // row i holds the totals for (i + 2) clusters
  printf("%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n",
    i + 2,
    get_score(i, find_data),
    avg_distance,
    avg_variance,
    avg_cluster_size,
    avg_empty_clusters
  );
}

/**
 * Print the results as CSV to stdout: a header line followed by one
 * line per row of the totals table (MAX_CLUSTERS - 2 rows).
 */
static void
print_csv(
  const find_t * const find_data
) {
  // column names
  fputs("#,score,distance,variance,cluster_size,empty_clusters\n", stdout);

  // one line per row of the table
  const size_t num_rows = MAX_CLUSTERS - 2;
  for (size_t row = 0; row < num_rows; row++) {
    print_csv_row(row, find_data);
  }
}

// output image dimensions, in pixels
#define IM_WIDTH 128
#define IM_HEIGHT 128

// bytes per image row (3 bytes per pixel)
#define IM_STRIDE (3 * IM_WIDTH)

// pixel buffer for the output image: IM_HEIGHT rows of IM_STRIDE bytes
static uint8_t im_data[IM_STRIDE * IM_HEIGHT];

/**
 * Render `set` into the shared im_data buffer and write it to `png_path`
 * as an IM_WIDTH x IM_HEIGHT PNG with 3 bytes per pixel.
 *
 * Calls die() if stbi_write_png() reports failure.
 */
static void
save_png(
  const char * const png_path,
  const km_set_t * const set
) {
  // start from a blank canvas: every byte 0xff (white)
  memset(im_data, 0xff, sizeof(im_data));

  // plot the set's points in color 0xff0000 (red)
  km_set_draw(set, im_data, IM_WIDTH, IM_HEIGHT, 0xff0000);

  // stbi_write_png() returns zero on failure
  const int wrote = stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE);
  if (!wrote) {
    die("stbi_write_png(\"%s\")", png_path);
  }
}

/**
 * Program entry point.
 *
 * Loads the data set from the path given as the first argument,
 * normalizes it, runs km_find() over it, prints the accumulated results
 * as CSV to stdout, and writes an image of the data set to "data.png".
 *
 * Returns EXIT_FAILURE (after printing usage to stderr) when no path is
 * given, and 0 after a complete run.  Failures of the individual steps
 * are reported through die().
 */
int main(int argc, char *argv[]) {
  // a data file path is required
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <data>\n", argv[0]);
    return EXIT_FAILURE;
  }
  const char * const data_path = argv[1];

  // seed rand() with the process ID
  srand(getpid());

  // zero the result totals, then set up the random source that
  // find_on_init() hands to km_set_init_rand_clusters()
  find_t totals;
  memset(totals.rows, 0, sizeof(totals.rows));
  km_rand_init_system(&(totals.rs));

  // load the data set; LOAD_CBS initializes and fills `points`
  km_set_t points;
  if (!km_load_path(data_path, &LOAD_CBS, &points)) {
    die("km_load_path() failed");
  }

  if (!km_set_normalize(&points)) {
    die("km_set_normalize() failed");
  }

  // run the search; FIND_CBS accumulates results into `totals`
  if (!km_find(&points, &FIND_CBS, &totals)) {
    die("km_find()");
  }

  // report the results
  print_csv(&totals);

  // write an image of the data set
  save_png("data.png", &points);

  // release the data set
  km_set_fini(&points);

  return 0;
}