aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Duncan <pabs@pablotron.org>2019-02-05 11:08:51 -0500
committerPaul Duncan <pabs@pablotron.org>2019-02-05 11:08:51 -0500
commit42a5ea32a5bbba10b112a601ea8f34412842f1c3 (patch)
tree91b6ac1e940cc3d24fb2258df03bd16d6a11cbdd
parent7907db7627aafa35a45f7246e79f5369b6714828 (diff)
downloadkmeans-42a5ea32a5bbba10b112a601ea8f34412842f1c3.tar.bz2
kmeans-42a5ea32a5bbba10b112a601ea8f34412842f1c3.zip
use openmp, s/silouette/silhouette/, move row_map allocation outside of km_solve()
-rw-r--r--Makefile2
-rw-r--r--src/km-find.c28
-rw-r--r--src/km-set.c7
-rw-r--r--src/km-solve.c72
-rw-r--r--src/km.h5
-rw-r--r--src/main.c4
6 files changed, 56 insertions, 62 deletions
diff --git a/Makefile b/Makefile
index f69b526..11e4f11 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
APP=km-test
-CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2
+CFLAGS=-W -Wall -Wextra -Werror -pedantic -std=c11 -O2 -fopenmp
OBJS=src/km-set.o src/km-draw.o src/km-load.o src/km-find.o \
src/km-rand.o src/km-solve.o src/km-init-rand.o src/main.o \
src/km-init-forgy.o src/km-init-kmeans.o src/km-init.o \
diff --git a/src/km-find.c b/src/km-find.c
index 5ae5948..e9fed0f 100644
--- a/src/km-find.c
+++ b/src/km-find.c
@@ -28,9 +28,9 @@ find_solve_on_stats(
km_find_data_t * const find_data = cb_data;
UNUSED(set);
- // save total sum and silouette
+ // save total sum and silhouette
find_data->distance_sum = stats->sum;
- find_data->silouette = stats->silouette;
+ find_data->silhouette = stats->silhouette;
}
static const km_solve_cbs_t
@@ -59,7 +59,14 @@ km_find(
return false;
}
- float best_silouette = -2.0;
+ // allocate row map: row => distance, cluster ID
+ km_row_map_t * const row_map = calloc(set->num_rows, sizeof(km_row_map_t));
+ if (!row_map) {
+ // return failure
+ return false;
+ }
+
+ float best_silhouette = -2.0;
for (size_t i = 2; i < cbs->max_clusters; i++) {
for (size_t j = 0; j < cbs->num_tests; j++) {
// init cluster set
@@ -76,8 +83,8 @@ km_find(
};
// solve test
- // (populates sum and silouette)
- if (!km_solve(&cs, set, &FIND_SOLVE_CBS, &find_data)) {
+ // (populates sum and silhouette)
+ if (!km_solve(&cs, set, row_map, &FIND_SOLVE_CBS, &find_data)) {
// return failure
return false;
}
@@ -88,15 +95,15 @@ km_find(
// emit result
cbs->on_data(&find_data, cb_data);
- if (find_data.silouette > best_silouette) {
+ if (find_data.silhouette > best_silhouette) {
// emit new best result
- if (cbs->on_best && !cbs->on_best(find_data.silouette, &cs, cb_data)) {
+ if (cbs->on_best && !cbs->on_best(find_data.silhouette, &cs, cb_data)) {
// return failure
return false;
}
- // update best silouette
- best_silouette = find_data.silouette;
+ // update best silhouette
+ best_silhouette = find_data.silhouette;
}
// finalize cluster set
@@ -107,6 +114,9 @@ km_find(
}
}
+ // free row map
+ free(row_map);
+
// return success
return true;
}
diff --git a/src/km-set.c b/src/km-set.c
index fcf1054..ca98a2b 100644
--- a/src/km-set.c
+++ b/src/km-set.c
@@ -237,6 +237,7 @@ km_set_normalize(
}
// normalize values
+ #pragma omp parallel for collapse(2)
for (size_t i = 0; i < set->num_rows; i++) {
for (size_t j = 0; j < num_floats; j++) {
const size_t ofs = i * num_floats + j;
@@ -257,21 +258,23 @@ km_set_normalize(
}
// get row from data set
+// unsafe (no bounds checking)
float *
km_set_get_row(
const km_set_t * const set,
const size_t i
) {
const size_t num_floats = set->shape.num_floats;
- return (num_floats) ? (set->floats + i * num_floats) : NULL;
+ return set->floats + i * num_floats;
}
// get row from data set
+// unsafe (no bounds checking)
int *
km_set_get_row_ints(
const km_set_t * const set,
const size_t i
) {
const size_t num_ints = set->shape.num_ints;
- return (num_ints) ? (set->ints + i * num_ints) : NULL;
+ return set->ints + i * num_ints;
}
diff --git a/src/km-solve.c b/src/km-solve.c
index b6d98d6..271795b 100644
--- a/src/km-solve.c
+++ b/src/km-solve.c
@@ -6,36 +6,18 @@
#define MIN_DELTA 0.00001
-// alloc and initialize row map
-static km_row_map_t *
-km_row_map_init(
+static void
+km_row_map_clear(
+ km_row_map_t * const row_map,
const size_t num_rows
) {
- // alloc row map
- km_row_map_t * const row_map = malloc(sizeof(km_row_map_t) * num_rows);
-
- // check for error
- if (!row_map) {
- // return failure
- return false;
- }
-
// init row map
+ #pragma omp parallel for
for (size_t i = 0; i < num_rows; i++) {
// setting distances to maximum
row_map[i].d2 = FLT_MAX;
row_map[i].d2_near = FLT_MAX;
}
-
- // return row map
- return row_map;
-}
-
-static void
-km_row_map_fini(
- km_row_map_t * const row_map
-) {
- free(row_map);
}
// use k-means to iteratively update the cluster centroids until there
@@ -44,18 +26,15 @@ bool
km_solve(
km_set_t * const cs,
const km_set_t * const set,
+ km_row_map_t * const row_map,
const km_solve_cbs_t * const cbs,
void * const cb_data
) {
const size_t num_clusters = cs->num_rows,
num_floats = set->shape.num_floats;
- // row map: row => distance, cluster ID
- km_row_map_t *row_map = km_row_map_init(set->num_rows);
- if (!row_map) {
- // return failure
- return false;
- }
+ // clear row map distances
+ km_row_map_clear(row_map, set->num_rows);
// calculate clusters by doing the following:
// * walk all clusters and all rows
@@ -68,11 +47,13 @@ km_solve(
// no changes yet
changed = false;
+ #pragma omp parallel for collapse(2)
for (size_t i = 0; i < num_clusters; i++) {
- // get the floats for this cluster
- const float * const floats = km_set_get_row(cs, i);
-
for (size_t j = 0; j < set->num_rows; j++) {
+ // get the floats for this cluster
+ // NOTE: moved to inner loop for "collapse(2)" above
+ const float * const floats = km_set_get_row(cs, i);
+
// get row values
const float * const vals = km_set_get_row(set, j);
@@ -101,7 +82,8 @@ km_solve(
// if there were changes, then we need to calculate the new
// cluster centers
- // calculate new center
+ // calculate new cluster centers
+ #pragma omp parallel for
for (size_t i = 0; i < num_clusters; i++) {
size_t num_rows = 0;
float * const floats = km_set_get_row(cs, i);
@@ -141,30 +123,31 @@ km_solve(
if (cbs && cbs->on_stats) {
float sum = 0,
- silouette = 0;
+ silhouette = 0;
- // calculate mean numerators and silouette
+ // calculate mean numerators and silhouette
+ #pragma omp parallel for reduction(+:sum,silhouette)
for (size_t i = 0; i < set->num_rows; i++) {
- // sum distances
- sum += row_map[i].d2;
-
- // calculate silouette denominator
+ // calculate silhouette denominator
// (https://en.wikipedia.org/wiki/Silhouette_%28clustering%29)
const float delta = (row_map[i].d2_near - row_map[i].d2);
- // sum silouette
- silouette += (delta < -MIN_DELTA || delta > MIN_DELTA)
+ // sum silhouette
+ silhouette += (delta < -MIN_DELTA || delta > MIN_DELTA)
? (delta / MAX(row_map[i].d2, row_map[i].d2_near))
: 0.0;
+
+ // sum distances
+ sum += row_map[i].d2;
}
- // finalize silouette
- silouette /= set->num_rows;
+ // finalize silhouette
+ silhouette /= set->num_rows;
// build stats
const km_solve_stats_t stats = {
.sum = sum,
- .silouette = silouette,
+ .silhouette = silhouette,
.num_clusters = num_clusters,
};
@@ -172,9 +155,6 @@ km_solve(
cbs->on_stats(cs, &stats, cb_data);
}
- // free row_map
- km_row_map_fini(row_map);
-
// return success
return true;
}
diff --git a/src/km.h b/src/km.h
index 7906f00..83d7be0 100644
--- a/src/km.h
+++ b/src/km.h
@@ -94,7 +94,7 @@ typedef struct {
typedef struct {
const float sum;
- const float silouette;
+ const float silhouette;
const size_t num_clusters;
} km_solve_stats_t;
@@ -109,6 +109,7 @@ _Bool
km_solve(
km_set_t * const,
const km_set_t * const,
+ km_row_map_t * const,
const km_solve_cbs_t * const,
void * const
);
@@ -117,7 +118,7 @@ typedef struct {
const km_set_t * const cluster_set;
float distance_sum,
- silouette,
+ silhouette,
mean_cluster_size;
const size_t num_clusters,
diff --git a/src/main.c b/src/main.c
index 8bae71c..bc8c1f1 100644
--- a/src/main.c
+++ b/src/main.c
@@ -41,7 +41,7 @@ typedef struct {
struct {
float distance,
- silouette,
+ silhouette,
cluster_size;
size_t num_empty;
} rows[MAX_CLUSTERS - 2];
@@ -163,7 +163,7 @@ find_on_data(
const size_t ofs = data->num_clusters - 2;
ctx->rows[ofs].distance += data->distance_sum;
- ctx->rows[ofs].silouette += data->silouette;
+ ctx->rows[ofs].silhouette += data->silhouette;
ctx->rows[ofs].cluster_size += data->mean_cluster_size;
ctx->rows[ofs].num_empty += data->num_empty_clusters;
}