From 61e10dccf3455e43f53489e3fae65344a1092f1c Mon Sep 17 00:00:00 2001
From: Paul Duncan <pabs@pablotron.org>
Date: Tue, 5 Feb 2019 13:48:07 -0500
Subject: add "Test Data" section to README, add more tests and tests/bm.csv

---
 README.md | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'README.md')
diff --git a/README.md b/README.md
index e43568f..4554d22 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,10 @@ K-Means Clustering
 
 [K-Means][kmeans] clustering implementation.  Features:
 
-* Initialization methods: random, [forgy][forgy], and [k-means++][kmeanspp].
-* Uses [Silhouette method][silhouette] to determine the optimum cluster
-  count.
+* Initialization methods: random, [forgy][forgy], and
+  [k-means++][kmeanspp].
+* Uses the [Silhouette method][silhouette] to determine the optimum
+  cluster count.
 * No external dependencies.
 
 Examples
@@ -18,15 +19,19 @@ Generate 1000 rows of test data, clustered around 3 points:
     # classify data, plot results as png
     ./km-test kmeans example.dat example.png
 
-Run all tests and save best clustering in current directory along with
+Run all tests and save best results in current directory along with
 PNGs of results:
 
     for i in tests/*dat; do
-      # build paths
-      png_path=kmeans-$(basename ${i/dat/png})
+      # path to output data file
+      # (ex: src: "tests/c3-1e2-0.dat", dst: "kmeans-c3-1e2-0.dat")
       dst_path=kmeans-$(basename $i)
 
-      # run test
+      # path to output png
+      # (ex: src: "tests/c3-1e2-0.dat", dst: "kmeans-c3-1e2-0.png")
+      png_path=kmeans-$(basename ${i/dat/png})
+
+      # run test (use kmeans for initialization)
       echo $i
       ./km-test kmeans $i $png_path > $dst_path
     done
@@ -42,6 +47,9 @@ Supported initialization methods:
   to pick the initial cluster centroids.  This is the recommended
   initialization method.
 
+It's probably best to just stick with [kmeans][kmeanspp] unless you know
+what you're doing.
+
 Data File Format
 ----------------
 Reads and writes newline-delimited plain text files in the following
@@ -66,8 +74,23 @@ Example data file:
     9.8 6.5 4.3
     3.2 5.6 8.7
 
-See the files in `tests/` for additional examples.  You can also use
-the top-level `gen-data.rb` script to generate additional test data.
+See the files in `tests/` for additional example data files.  You can
+also use the top-level `gen-data.rb` script to generate additional test
+data.
+
+Test Data
+---------
+The test data files in the `tests/` directory use the following naming
+convention:
+
+    c<num_clusters>-<num_rows>-<N>.dat
+
+Where:
+
+* `num_clusters`: Number of clusters.
+* `num_rows`: Number of rows, in exponent notation (`1e3` = 1000,
+  `1e4` = 10000, etc.).
+* `N`: Distinguishing suffix (usually 0).
 
   [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering "K-Means clustering"
   [kmeanspp]: https://en.wikipedia.org/wiki/K-means%2B%2B "k-means++ initialization method"
-- 
cgit v1.2.3