Cleanups to demos

HolyLab · Nov 8, 2023 · aba559f · aba559f
1 parent b86ca52
commit aba559f
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -39,4 +39,10 @@ For these algorithms, any data set with more than 3000 points was excluded. Here
    5 │ dbscan        26
 ```
 
-It is worth noting that with the exception of `dbscan`, all of these algorithms received one or more "hints" from the reference clustering (e.g., the number of clusters). `dbscan`'s hyperparameters were set to have `2d` neighbors in `d` dimensions, and the radius was set as the mean distance to the `2d`th neighbor.
+Algorithms (or variants) marked with a `*` are fully automated and do not require anything beyond the input data. The remaining algorithms received one or more "hints" from the reference clustering (e.g., the number of clusters).
+Some choices were made for the fully-automated variants:
+
+- `dbscan`'s hyperparameters were set to have `2d` neighbors in `d` dimensions, and the radius was set as the mean distance to the `2d`th neighbor.
+- `hclust*` cuts the tree at the largest gap between successive merge heights in the [dendrogram](https://en.wikipedia.org/wiki/Dendrogram).
+
+There may be more optimal automation strategies than these. In particular, all algorithms can be automated by optimizing [evaluation metrics](https://juliastats.org/Clustering.jl/stable/validate.html), although one still needs to choose, e.g., the range of numbers of clusters considered.
diff --git a/demos/ami.png b/demos/ami.png
diff --git a/demos/bench_clustering.jl b/demos/bench_clustering.jl
@@ -31,9 +31,19 @@ for (battery, pattern) in gdatasets
     end
 end
 
+# Algorithms that require "hints" from the reference clustering (in addition to using it for evaluation)
 evaluate_kmeans(X, refclust) = ami(refclust, kmeans(X, nclust(refclust)).assignments)  # hint: nclust(refclust) (presumably the reference cluster count) is passed to kmeans; result is scored with ami
 evaluate_kmedoids(X, refclust) = size(X, 2) < 3000 ? ami(refclust, kmedoids(pairwise(Euclidean(), X), nclust(refclust)).assignments) : missing  # same hint, run on pairwise Euclidean distances; `missing` when X has 3000 or more columns
 evaluate_hclust(X, refclust) = size(X, 2) < 3000 ? ami(refclust, cutree(hclust(pairwise(Euclidean(), X)); k=nclust(refclust))) : missing  # tree is cut with k=nclust(refclust); `missing` when X has 3000 or more columns
+
+# Algorithms that work without reference to the reference clustering (except for evaluation)
+function evaluate_hclust_auto(X, refclust)  # score (via ami against refclust) a hierarchical clustering whose cut height is chosen from the data alone
+    size(X, 2) < 3000 || return missing  # data sets with 3000 or more columns are skipped and reported as `missing`
+    hc = hclust(pairwise(Euclidean(), X))  # hierarchical clustering of the pairwise Euclidean distance matrix
+    # Split at the largest gap in the dendrogram, i.e. the biggest jump between consecutive entries of hc.heights
+    idx = argmax(diff(hc.heights))  # position of the largest jump; NOTE(review): argmax throws if hc.heights has fewer than 2 entries
+    return ami(refclust, cutree(hc; h=mean(hc.heights[idx:idx+1])))  # cut halfway between the two heights that bound that jump; refclust is used only for scoring
+end
 evaluate_affprop(X, refclust) = size(X, 2) < 3000 ? ami(refclust, affinityprop(-pairwise(Euclidean(), X)).assignments) : missing  # affinityprop receives the negated pairwise Euclidean distances; refclust is used only for scoring; `missing` when X has 3000 or more columns
 function evaluate_dbscan(X, refclust)
     size(X, 2) < 3000 || return missing
@@ -63,7 +73,12 @@ end
 dspairs = [battery * "/" * dataset => Union{Float64,Missing}[] for (battery, dataset) in datasets]  # one empty score column per "battery/dataset"; Missing is allowed because evaluators may return `missing`
 pushfirst!(dspairs, "Algorithm" => String[])  # put the algorithm-name column first
 df = DataFrame(dspairs)  # empty results table
-@showprogress desc="Algorithm" for (f, name) in ((evaluate_kmeans, "kmeans"), (evaluate_kmedoids, "kmedoids"), (evaluate_hclust, "hclust"), (evaluate_affprop, "affprop"), (evaluate_dbscan, "dbscan"))
+@showprogress desc="Algorithm" for (f, name) in ((evaluate_kmeans, "kmeans"),  # each entry is (evaluation function, label for the results table)
+                                                 (evaluate_kmedoids, "kmedoids"),
+                                                 (evaluate_hclust, "hclust"),
+                                                 (evaluate_hclust_auto, "hclust*"),  # labels ending in "*" mark the variants that take no hint from the reference clustering
+                                                 (evaluate_affprop, "affprop*"),
+                                                 (evaluate_dbscan, "dbscan*"))
     add_algorithm!(df, f, name, datasets)  # defined outside this view; presumably records f's score on every data set in df under `name` — confirm
 end