From dbebf993978def0134af9d0b98a9cdb028551e9b Mon Sep 17 00:00:00 2001 From: Paul Duncan Date: Mon, 4 Feb 2019 22:43:47 -0500 Subject: use silouette method for scoring --- km-find.c | 54 +++++++++--------------------------------------------- km-solve.c | 59 +++++++++++++++++++++++++++++++++++++---------------------- km.h | 13 ++++++++----- main.c | 32 +++++++++++++++----------------- 4 files changed, 69 insertions(+), 89 deletions(-) diff --git a/km-find.c b/km-find.c index 7cd24f6..3716123 100644 --- a/km-find.c +++ b/km-find.c @@ -7,31 +7,11 @@ typedef struct { float sum, - mean_sum, - variance_sum; - size_t num_empty_clusters; + silouette; } find_solve_data_t; static float -find_get_mean_distance( - const find_solve_data_t * const d, - const size_t num_clusters -) { - const bool num_filled = num_clusters - d->num_empty_clusters; - return num_filled ? (d->mean_sum / num_filled) : 0; -} - -static float -find_get_mean_variance( - const find_solve_data_t * const d, - const size_t num_clusters -) { - const bool num_filled = num_clusters - d->num_empty_clusters; - return num_filled ? (d->variance_sum / num_filled) : 0; -} - -static float -find_get_mean_cluster_size( +get_mean_cluster_size( const km_set_t * const set ) { float sum = 0; @@ -56,21 +36,9 @@ find_solve_on_stats( find_solve_data_t * const solve_data = cb_data; UNUSED(set); - // save total sum + // save total sum and silouette solve_data->sum = stats->sum; - - // calculate numerator for the average distance across all clusters in - // this test - for (size_t i = 0; i < stats->num_clusters; i++) { - if (fabsf(stats->means[i]) > MIN_CLUSTER_DISTANCE) { - // increment mean count - solve_data->mean_sum += stats->means[i]; - solve_data->variance_sum += stats->variances[i]; - } else { - // increment empty cluster count - solve_data->num_empty_clusters++; - } - } + solve_data->silouette = stats->silouette; } static const km_solve_cbs_t @@ -111,10 +79,8 @@ km_find( // init solve data find_solve_data_t solve_data = { - .sum = 0, - .mean_sum = 0, - .variance_sum = 0, - .num_empty_clusters = 0, + .sum = 0, + .silouette = 0, }; // solve test @@ -128,17 +94,15 @@ km_find( .cluster_set = &cs, .num_clusters = i, .distance_sum = solve_data.sum, - .mean_distance = find_get_mean_distance(&solve_data, i), - .mean_variance = find_get_mean_variance(&solve_data, i), - .mean_cluster_size = find_get_mean_cluster_size(&cs), - .num_empty_clusters = solve_data.num_empty_clusters, + .silouette = solve_data.silouette, + .mean_cluster_size = get_mean_cluster_size(&cs), }; // emit result cbs->on_data(&result, cb_data); // score result - float score = km_score(result.distance_sum, result.num_empty_clusters); + const float score = solve_data.silouette; if (score > best_score) { // emit new best result diff --git a/km-solve.c b/km-solve.c index 2a4de16..8a70a48 100644 --- a/km-solve.c +++ b/km-solve.c @@ -5,6 +5,8 @@ #include "util.h" #include "km.h" +#define MIN_CLUSTER_DISTANCE 0.00001 + // alloc and initialize row map static km_row_map_t * km_row_map_init( @@ -19,9 +21,11 @@ km_row_map_init( return false; } - // init row map by setting the maximum distance + // init row map for (size_t i = 0; i < num_rows; i++) { + // setting distances to maximum row_map[i].d2 = FLT_MAX; + row_map[i].d2_near = FLT_MAX; } // return row map @@ -84,6 +88,13 @@ km_solve( // flag change changed = true; } + + if ((row_map[j].cluster != i) && (d2 < row_map[j].d2_near)) { + row_map[j].d2_near = d2; + + // flag change + changed = true; + } } } @@ -131,44 +142,48 @@ km_solve( if (cbs && cbs->on_stats) { float sum = 0, - means[num_clusters], - variances[num_clusters]; + silouette = 0, + mean_dists[num_clusters], + mean_nears[num_clusters]; - memset(means, 0, sizeof(float) * num_clusters); - memset(variances, 0, sizeof(float) * num_clusters); + memset(mean_dists, 0, sizeof(mean_dists)); + memset(mean_nears, 0, sizeof(mean_nears)); // calculate sum of distances across all clusters for (size_t i = 0; i < set->num_rows; i++) { sum += row_map[i].d2; } - // calculate mean distances + // calculate mean numerators and silouette for (size_t i = 0; i < set->num_rows; i++) { - means[row_map[i].cluster] += row_map[i].d2; - } - // finalize means - for (size_t i = 0; i < num_clusters; i++) { - means[i] = (cs->ints[i]) ? (sqrt(means[i]) / cs->ints[i]) : 0; - } + // distance squared (d2) to center of this cluster + mean_dists[row_map[i].cluster] += row_map[i].d2; - // calculate variances - for (size_t i = 0; i < set->num_rows; i++) { - const size_t cluster = row_map[i].cluster; - const float variance = (sqrt(row_map[i].d2) - means[cluster]) * - (sqrt(row_map[i].d2) - means[cluster]); - variances[cluster] += variance; + // distance squared (d2) to center of nearest cluster + mean_nears[row_map[i].cluster] += row_map[i].d2_near; + + // calculate silouette denominator + const float delta = row_map[i].d2_near - row_map[i].d2; + if (fabsf(delta) > MIN_CLUSTER_DISTANCE) { + silouette += delta / MAX(row_map[i].d2, row_map[i].d2_near); + } } - // finalize variances + // finalize means (divide by row count) for (size_t i = 0; i < num_clusters; i++) { - variances[i] = (cs->ints[i]) ? (variances[i] / cs->ints[i]) : 0; + mean_dists[i] = (cs->ints[i]) ? (sqrt(mean_dists[i]) / cs->ints[i]) : 0; + mean_nears[i] = (cs->ints[i]) ? (sqrt(mean_nears[i]) / cs->ints[i]) : 0; } + // finalize silouette + silouette /= set->num_rows; + // build stats const km_solve_stats_t stats = { .sum = sum, - .means = means, - .variances = variances, + .silouette = silouette, + .mean_dists = mean_dists, + .mean_nears = mean_nears, .num_clusters = num_clusters, }; diff --git a/km.h b/km.h index bc2e3a6..bd3fe41 100644 --- a/km.h +++ b/km.h @@ -84,14 +84,18 @@ _Bool km_set_normalize(km_set_t * const); float *km_set_get_row(const km_set_t *, const size_t); typedef struct { - float d2; + float d2, d2_near; size_t cluster; } km_row_map_t; typedef struct { const float sum; - const float *means; - const float *variances; + + const float silouette; + + const float *mean_dists; + const float *mean_nears; + const size_t num_clusters; } km_solve_stats_t; @@ -114,8 +118,7 @@ typedef struct { const km_set_t * const cluster_set; const float distance_sum, - mean_distance, - mean_variance, + silouette, mean_cluster_size; const size_t num_clusters, diff --git a/main.c b/main.c index 5629cd5..ed46af7 100644 --- a/main.c +++ b/main.c @@ -13,7 +13,7 @@ #include "km.h" #define MAX_CLUSTERS 10 -#define NUM_TESTS 100 +#define NUM_TESTS 300 #define MAX_BEST 10 #define IM_WIDTH 128 @@ -34,9 +34,9 @@ typedef struct { struct { float distance, - variance, + silouette, cluster_size; - size_t num_empty_clusters; + size_t num_empty; } rows[MAX_CLUSTERS - 2]; // best clusters @@ -62,7 +62,7 @@ ctx_best_sort( // sort best sets by ascending score (worst to best) qsort( ctx->best, - (ctx->num_best % MAX_BEST), + MIN(ctx->num_best, MAX_BEST), sizeof(best_item_t), best_score_cmp ); @@ -162,10 +162,10 @@ find_on_data( ctx_t * const ctx = cb_data; const size_t ofs = data->num_clusters - 2; - ctx->rows[ofs].distance += data->mean_distance; - ctx->rows[ofs].variance += data->mean_variance; + ctx->rows[ofs].distance += data->distance_sum; + ctx->rows[ofs].silouette += data->silouette; ctx->rows[ofs].cluster_size += data->mean_cluster_size; - ctx->rows[ofs].num_empty_clusters += data->num_empty_clusters; + ctx->rows[ofs].num_empty += data->num_empty_clusters; } static bool @@ -184,7 +184,6 @@ find_on_best( if (ctx->num_best >= MAX_BEST) { // finalize old best data set - // D("finalizing old best %zu", ctx->num_best); km_set_fini(dst); } @@ -220,17 +219,15 @@ ctx_csv_print_row( ) { const size_t num_clusters = i + 2; const float mean_distance = ctx->rows[i].distance / NUM_TESTS, - mean_variance = ctx->rows[i].variance / NUM_TESTS, mean_cluster_size = ctx->rows[i].cluster_size / NUM_TESTS, - mean_empty = 1.0 * ctx->rows[i].num_empty_clusters / NUM_TESTS, - score = km_score(mean_distance, mean_empty); + mean_empty = 1.0 * ctx->rows[i].num_empty / NUM_TESTS, + score = ctx->rows[i].silouette / NUM_TESTS; // print result - fprintf(fh, "%zu,%0.3f,%0.3f,%0.3f,%0.3f,%0.3f\n", + fprintf(fh, "%zu,%0.3f,%0.3f,%0.3f,%0.3f\n", num_clusters, score, mean_distance, - mean_variance, mean_cluster_size, mean_empty ); @@ -242,7 +239,7 @@ ctx_csv_print( FILE * const fh ) { // print headers - fprintf(fh, "#,score,distance,variance,cluster_size,empty_clusters\n"); + fprintf(fh, "#,score,distance,size,empty\n"); // print rows for (size_t i = 0; i < MAX_CLUSTERS - 2; i++) { @@ -260,12 +257,13 @@ save_on_best( const float score, void * const cb_data ) { + const ctx_t * const ctx = cb_data; UNUSED(score); - UNUSED(cb_data); // convert rank to channel brightness const uint8_t ch = 0x33 + (0xff - 0x33) * (1.0 * rank + 1) / (MAX_BEST); - const uint32_t color = (ch & 0xff) << 16; + const uint8_t shift = (rank == MIN(ctx->num_best, MAX_BEST) - 1) ? 8 : 16; + const uint32_t color = (ch & 0xff) << shift; // const uint32_t color = 0xff0000; // D("rank = %zu, score = %0.3f, size = %zu, color = %06x", rank, score, set->num_rows, color); @@ -289,7 +287,7 @@ ctx_save_png( } // draw best cluster points - ctx_best_each(ctx, save_on_best, NULL); + ctx_best_each(ctx, save_on_best, (void*) ctx); // save png if (!stbi_write_png(png_path, IM_WIDTH, IM_HEIGHT, 3, im_data, IM_STRIDE)) { -- cgit v1.2.3