aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--km-find.c54
-rw-r--r--km-solve.c59
-rw-r--r--km.h13
-rw-r--r--main.c32
4 files changed, 69 insertions, 89 deletions
diff --git a/km-find.c b/km-find.c
index 7cd24f6..3716123 100644
--- a/km-find.c
+++ b/km-find.c
@@ -7,31 +7,11 @@
typedef struct {
float sum,
- mean_sum,
- variance_sum;
- size_t num_empty_clusters;
+ silouette;
} find_solve_data_t;
static float
-find_get_mean_distance(
- const find_solve_data_t * const d,
- const size_t num_clusters
-) {
- const bool num_filled = num_clusters - d->num_empty_clusters;
- return num_filled ? (d->mean_sum / num_filled) : 0;
-}
-
-static float
-find_get_mean_variance(
- const find_solve_data_t * const d,
- const size_t num_clusters
-) {
- const bool num_filled = num_clusters - d->num_empty_clusters;
- return num_filled ? (d->variance_sum / num_filled) : 0;
-}
-
-static float
-find_get_mean_cluster_size(
+get_mean_cluster_size(
const km_set_t * const set
) {
float sum = 0;
@@ -56,21 +36,9 @@ find_solve_on_stats(
find_solve_data_t * const solve_data = cb_data;
UNUSED(set);
- // save total sum
+ // save total sum and silouette
solve_data->sum = stats->sum;
-
- // calculate numerator for the average distance across all clusters in
- // this test
- for (size_t i = 0; i < stats->num_clusters; i++) {
- if (fabsf(stats->means[i]) > MIN_CLUSTER_DISTANCE) {
- // increment mean count
- solve_data->mean_sum += stats->means[i];
- solve_data->variance_sum += stats->variances[i];
- } else {
- // increment empty cluster count
- solve_data->num_empty_clusters++;
- }
- }
+ solve_data->silouette = stats->silouette;
}
static const km_solve_cbs_t
@@ -111,10 +79,8 @@ km_find(
// init solve data
find_solve_data_t solve_data = {
- .sum = 0,
- .mean_sum = 0,
- .variance_sum = 0,
- .num_empty_clusters = 0,
+ .sum = 0,
+ .silouette = 0,
};
// solve test
@@ -128,17 +94,15 @@ km_find(
.cluster_set = &cs,
.num_clusters = i,
.distance_sum = solve_data.sum,
- .mean_distance = find_get_mean_distance(&solve_data, i),
- .mean_variance = find_get_mean_variance(&solve_data, i),
- .mean_cluster_size = find_get_mean_cluster_size(&cs),
- .num_empty_clusters = solve_data.num_empty_clusters,
+ .silouette = solve_data.silouette,
+ .mean_cluster_size = get_mean_cluster_size(&cs),
};
// emit result
cbs->on_data(&result, cb_data);
// score result
- float score = km_score(result.distance_sum, result.num_empty_clusters);
+ const float score = solve_data.silouette;
if (score > best_score) {
// emit new best result
diff --git a/km-solve.c b/km-solve.c
index 2a4de16..8a70a48 100644
--- a/km-solve.c
+++ b/km-solve.c
@@ -5,6 +5,8 @@
#include "util.h"
#include "km.h"
+#define MIN_CLUSTER_DISTANCE 0.00001
+
// alloc and initialize row map
static km_row_map_t *
km_row_map_init(
@@ -19,9 +21,11 @@ km_row_map_init(
return false;
}
- // init row map by setting the maximum distance
+ // init row map
for (size_t i = 0; i < num_rows; i++) {
+ // setting distances to maximum
row_map[i].d2 = FLT_MAX;
+ row_map[i].d2_near = FLT_MAX;
}
// return row map
@@ -84,6 +88,13 @@ km_solve(
// flag change
changed = true;
}
+
+ if ((row_map[j].cluster != i) && (d2 < row_map[j].d2_near)) {
+ row_map[j].d2_near = d2;
+
+ // flag change
+ changed = true;
+ }
}
}
@@ -131,44 +142,48 @@ km_solve(
if (cbs && cbs->on_stats) {
float sum = 0,
- means[num_clusters],
- variances[num_clusters];
+ silouette = 0,
+ mean_dists[num_clusters],
+ mean_nears[num_clusters];
- memset(means, 0, sizeof(float) * num_clusters);
- memset(variances, 0, sizeof(float) * num_clusters);
+ memset(mean_dists, 0, sizeof(mean_dists));
+ memset(mean_nears, 0, sizeof(mean_nears));
// calculate sum of distances across all clusters
for (size_t i = 0; i < set->num_rows; i++) {
sum += row_map[i].d2;
}
- // calculate mean distances
+ // calculate mean numerators and silouette
for (size_t i = 0; i < set->num_rows; i++) {
- means[row_map[i].cluster] += row_map[i].d2;
- }
- // finalize means
- for (size_t i = 0; i < num_clusters; i++) {
- means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0;
- }
+ // distance squared (d2) to center of this cluster
+ mean_dists[row_map[i].cluster] += row_map[i].d2;
- // calculate variances
- for (size_t i = 0; i < set->num_rows; i++) {
- const size_t cluster = row_map[i].cluster;
- const float variance = (sqrt(row_map[i].d2) - means[cluster]) *
- (sqrt(row_map[i].d2) - means[cluster]);
- variances[cluster] += variance;
+ // distance squared (d2) to center of nearest cluster
+ mean_nears[row_map[i].cluster] += row_map[i].d2_near;
+
+ // calculate silouette denominator
+ const float delta = row_map[i].d2_near - row_map[i].d2;
+ if (fabsf(delta) > MIN_CLUSTER_DISTANCE) {
+ silouette += delta / MAX(row_map[i].d2, row_map[i].d2_near);
+ }
}
- // finalize variances
+ // finalize means (divide by row count)
for (size_t i = 0; i < num_clusters; i++) {
- variances[i] = (cs->ints[i]) ? (variances[i] / cs->ints[i]) : 0;
+ mean_dists[i] = (cs->ints[i]) ? (sqrt(mean_dists[i]) / cs->ints[i]) : 0;
+ mean_nears[i] = (cs->ints[i]) ? (sqrt(mean_nears[i]) / cs->ints[i]) : 0;
}
+ // finalize silouette
+ silouette /= set->num_rows;
+
// build stats
const km_solve_stats_t stats = {
.sum = sum,
- .means = means,
- .variances = variances,
+ .silouette = silouette,
+ .mean_dists = mean_dists,
+ .mean_nears = mean_nears,
.num_clusters = num_clusters,
};
diff --git a/km.h b/km.h
index bc2e3a6..bd3fe41 100644
--- a/km.h
+++ b/km.h
@@ -84,14 +84,18 @@ _Bool km_set_normalize(km_set_t * const);
float *km_set_get_row(const km_set_t *, const size_t);
typedef struct {
- float d2;
+ float d2, d2_near;
size_t cluster;
} km_row_map_t;
typedef struct {
const float sum;
- const float *means;
- const float *variances;
+
+ const float silouette;
+
+ const float *mean_dists;
+ const float *mean_nears;
+
const size_t num_clusters;
} km_solve_stats_t;
@@ -114,8 +118,7 @@ typedef struct {
const km_set_t * const cluster_set;
const float distance_sum,
- mean_distance,
- mean_variance,
+ silouette,
mean_cluster_size;
const size_t num_clusters,
diff --git a/main.c b/main.c
index 5629cd5..ed46af7 100644
--- a/main.c
+++ b/main.c
@@ -13,7 +13,7 @@
#include "km.h"
#define MAX_CLUSTERS 10
-#define NUM_TESTS 100
+#define NUM_TESTS 300
#define MAX_BEST 10
#define IM_WIDTH 128
@@ -34,9 +34,9 @@ typedef struct {
struct {
float distance,
- variance,
+ silouette,
cluster_size;
- size_t num_empty_clusters;
+ size_t num_empty;
} rows[MAX_CLUSTERS - 2];
// best clusters
@@ -62,7 +62,7 @@ ctx_best_sort(
// sort best sets by ascending score (worst to best)
qsort(
ctx->best,
- (ctx->num_best % MAX_BEST),
+ MIN(ctx->num_best, MAX_BEST),
sizeof(best_item_t),
best_score_cmp
);
@@ -162,10 +162,10 @@ find_on_data(
ctx_t * const ctx = cb_data;
const size_t ofs = data->num_clusters - 2;
- ctx->rows[ofs].distance += data->mean_distance;
- ctx->rows[ofs].variance += data->mean_variance;
+ ctx->rows[ofs].distance += data->distance_sum;
+ ctx->rows[ofs].silouette += data->silouette;
ctx->rows[ofs].cluster_size += data->mean_cluster_size;
- ctx->rows[ofs].num_empty_clusters += data->num_empty_clusters;
+ ctx->rows[ofs].num_empty += data->num_empty_clusters;
}
static bool
@@ -184,7 +184,6 @@ find_on_best(
if (ctx->num_best >= MAX_BEST) {
// finalize old best data set
- // D("finalizing old best %zu", ctx->num_best);
km_set_fini(dst);
}
@@ -220,17 +219,15 @@ ctx_csv_print_row(
) {
const size_t num_clusters = i + 2;
const float mean_distance = ctx->rows[i].distance / NUM_TESTS,
- mean_variance = ctx->rows[i].variance / NUM_TESTS,
mean_cluster_size = ctx->rows[i].cluster_size / NUM_TESTS,
- mean_empty = 1.0 * ctx->rows[i].num_empty_clusters / NUM_TESTS,
- score = km_score(mean_distance, mean_empty);
+ mean_empty = 1.0 * ctx->rows[i].num_empty / NUM_TESTS,
+ score = ctx->rows[i].silouette / NUM_TESTS;
// print result
- fprintf(fh, "%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n",
+ fprintf(fh, "%zu,%0.3f,%0.3f,%0.3f,%0.3f\n",
num_clusters,
score,
mean_distance,
- mean_variance,
mean_cluster_size,
mean_empty
);
@@ -242,7 +239,7 @@ ctx_csv_print(
FILE * const fh
) {
// print headers
- fprintf(fh, "#,score,distance,variance,cluster_size,empty_clusters\n");
+ fprintf(fh, "#,score,distance,size,empty\n");
// print rows
for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) {
@@ -260,12 +257,13 @@ save_on_best(
const float score,
void * const cb_data
) {
+ const ctx_t * const ctx = cb_data;
UNUSED(score);
- UNUSED(cb_data);
// convert rank to channel brightness
const uint8_t ch = 0x33 + (0xff - 0x33) * (1.0 * rank + 1) / (MAX_BEST);
- const uint32_t color = (ch & 0xff) << 16;
+ const uint8_t shift = (rank == MIN(ctx->num_best, MAX_BEST) - 1) ? 8 : 16;
+ const uint32_t color = (ch & 0xff) << shift;
// const uint32_t color = 0xff0000;
// D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color);
@@ -289,7 +287,7 @@ ctx_save_png(
}
// draw best cluster points
- ctx_best_each(ctx, save_on_best, NULL);
+ ctx_best_each(ctx, save_on_best, (void*) ctx);
// save png
if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) {