Vespucci  1.0.0
agglomerativeclustering.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  Copyright (C) 2014-2016 Wright State University - All Rights Reserved
3  Daniel P. Foose - Maintainer/Lead Developer
4 
5  This file is part of Vespucci.
6 
7  Vespucci is free software: you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation, either version 3 of the License, or
10  (at your option) any later version.
11 
12  Vespucci is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  GNU General Public License for more details.
16 
17  You should have received a copy of the GNU General Public License
18  along with Vespucci. If not, see <http://www.gnu.org/licenses/>.
19 *******************************************************************************/
21 #include <mlpack/core/metrics/mahalanobis_distance.hpp>
23 
49 Vespucci::Math::Clustering::AHCA::AHCA(std::string linkage_method, std::string metric_type)
50  : metric_type_(metric_type), linkage_method_(linkage_method)
51 {
52  std::set<std::string> valid_metrics = {"euclidean",
53  "squaredeuclidean",
54  "manhattan",
55  "chebyshev",
56  "cosine",
57  "correlation"};
58  std::set<std::string> valid_linkages = {"average",
59  "centroid",
60  "ward",
61  "complete",
62  "single"};
63  if (!valid_metrics.count(metric_type))
64  throw std::invalid_argument("Unsupported metric type " + metric_type);
65  if (!valid_linkages.count(linkage_method))
66  throw std::invalid_argument("Unsupported linkage method " + linkage_method);
67  if (linkage_method_ == "ward" && metric_type_ != "squaredeuclidean")
68  std::cerr << "Using Ward's method requires squared Euclidean distance."
69  "Squared Euclidean distance will be used in lieu of " + metric_type_ + "\n";
70 }
71 
73 {
74 
75 }
76 
81 void Vespucci::Math::Clustering::AHCA::Link(const arma::mat &data)
82 {
83  std::cout << "Begin AHCA::Link()" << std::endl;
84  arma::wall_clock timer;
85  timer.tic();
86  observations_ = data.n_cols;
87  merge_data_ = arma::mat(observations_, 2);
88  merge_data_.col(0) = arma::linspace(1, observations_, observations_);
89 
90  // The distance between each pair of points
91  if (linkage_method_ == "ward")
92  dist_ = Vespucci::Math::Clustering::pdist(data, "squaredeuclidean", 1, "square");
93  else
94  dist_ = Vespucci::Math::Clustering::pdist(data, metric_type_, 1, "square");
95 
96  //clusters holds the clustering nodes at this level
97  //we start with every observation in its own node
98  nodevec nodes; //clusters at this level
99  for (arma::uword i = 0; i < data.n_cols; ++i)
100  nodes.push_back(new AHCALeaf(i));
101  while (nodes.size() > 1){
102  clusters_[nodes.size()] = nodes;
103  //find the pairs with the closest distance in clusters
104  double distance;
105  arma::uvec to_merge = FindClosest(data, dist_, nodes, distance);
106  merge_data_(nodes.size() - 1, 1) = distance;
107  //create a new node in the list with two child nodes
108  node_t *node = new node_t(nodes[to_merge(0)], nodes[to_merge(1)], distance);
109  merge_data_(nodes.size() - 1, 1) = distance;
110  if (to_merge(0) > to_merge(1)){
111  nodes.erase(nodes.begin() + to_merge(0));
112  nodes.erase(nodes.begin() + to_merge(1));
113  }
114  else{
115  nodes.erase(nodes.begin() + to_merge(1));
116  nodes.erase(nodes.begin() + to_merge(0));
117  }
118 
119  nodes.push_back(node); //add merged cluster to vector and proceed
120  }
121  root_node_ = nodes[0]; //there is only one cluster left at this point
122  clusters_[1] = nodes;
123  merge_data_.shed_row(0);
124  std::cout << "End of AHCA::Link()" << std::endl;
125  std::cout << "took " << timer.toc() << " s" << std::endl;
126 }
127 
129 {
130  std::set<std::string> valid_linkages = {"average",
131  "centroid",
132  "ward",
133  "complete",
134  "single"};
135  if (!valid_linkages.count(linkage_method))
136  throw std::invalid_argument("Unsupported linkage method " + linkage_method);
137  linkage_method_ = linkage_method_;
138 }
139 
141 {
142  std::set<std::string> valid_metrics = {"euclidean",
143  "squaredeuclidean",
144  "manhattan",
145  "chebyshev",
146  "cosine",
147  "correlation"};
148  if (!valid_metrics.count(metric_type))
149  throw std::invalid_argument("Unsupported metric type " + metric_type);
150  if (linkage_method_ == "ward" && metric_type_ != "squaredeuclidean")
151  std::cerr << "Using Ward's method requires squared Euclidean distance."
152  "Squared Euclidean distance will be used in lieu of " + metric_type_ + "\n";
153  metric_type_ = metric_type;
154 }
155 
162 {
163  std::cout << "Begin AHCA::Cluster()" << std::endl;
164  arma::wall_clock timer;
165  timer.tic();
166  k = (k > clusters_.size() ? clusters_.size() : k);
167  arma::mat assignments = arma::zeros(dist_.n_rows, k);
168  for (arma::uword i = 1; i <= k; ++i){
169  nodevec cluster = clusters_.at(i);
170  for (arma::uword j = 0; j < cluster.size(); ++j){
171  arma::uvec column = {i - 1};
172  arma::uvec rows = cluster[j]->GetChildIndices();
173  assignments(rows, column).fill(j+1);
174  }
175  }
176  std::cout << "End AHCA::Cluster()" << std::endl;
177  std::cout << "Took " << timer.toc() << " s" << std::endl;
178  return assignments;
179 }
180 
186 {
187  return metric_type_;
188 }
189 
195 {
196  return linkage_method_;
197 }
198 
200 {
201  return merge_data_;
202 }
203 
205 {
206  return dist_;
207 }
208 
210 {
211  return clusters_;
212 }
213 
221 arma::uvec Vespucci::Math::Clustering::AHCA::FindClosest(const arma::mat &data,
222  const arma::mat &dist,
223  const nodevec &clusters,
224  double &distance)
225 {
226  if (clusters.size() < 2) throw std::invalid_argument("Only one cluster to merge");
227  arma::mat cluster_dist(clusters.size(), clusters.size());
228  for (size_t i = 0; i < clusters.size(); ++i){
229  #ifdef _WIN32
230  #pragma omp parallel for default(none) \
231  shared(cluster_dist, clusters, i, data, dist)
232  for (intmax_t j = 0; j < (intmax_t) clusters.size(); ++j)
233  #else
234  #pragma omp parallel for default(none) \
235  shared(cluster_dist, clusters, i, data, dist)
236  for (size_t j = 0; j < clusters.size(); ++ j)
237  #endif
238  {
239  cluster_dist(i, j) = ClusterDistance(i, j, data, dist, clusters);
240  cluster_dist(j, i) = ClusterDistance(i, j, data, dist, clusters);
241  }
242  }
243  //to prevent the trivial case of a cluster merging with itself, we fill
244  //the (i,i) pairs with the maxium value;
245  cluster_dist.diag(0).fill(cluster_dist.max());
246 
247  arma::uword min_index = cluster_dist.index_min();
248  distance = cluster_dist.min();
249  //if the total number of clusters is 2, then the matrix is filled with one value
250  if (clusters.size() == 2) return arma::uvec({0, 1});
251  return arma::ind2sub(arma::size(cluster_dist), min_index);
252 }
253 
262 double Vespucci::Math::Clustering::AHCA::ClusterDistance(int i, int j,
263  const arma::mat &data,
264  const arma::mat &dist,
265  const nodevec &clusters)
266 {
267  arma::uvec I = clusters[i]->GetChildIndices();
268  arma::uvec J = clusters[j]->GetChildIndices();
269  arma::mat distances = dist.submat(I, J);
270  if (linkage_method_ == "single") return distances.min();
271  if (linkage_method_ == "complete") return distances.max();
272  if (linkage_method_ == "average") return arma::mean(mean(distances));
273  if (linkage_method_ == "ward"){
274  arma::vec i_centroid = arma::mean(data.cols(I), 1);
275  arma::vec j_centroid = arma::mean(data.cols(J), 1);
276  Vespucci::Math::DistanceMetricWrapper metric(metric_type_);
277  return (metric.Evaluate(i_centroid, j_centroid) * I.n_rows * J.n_rows)/(I.n_rows + J.n_rows);
278  }
279  if (linkage_method_ == "centroid"){
280  arma::vec i_centroid = arma::mean(data.cols(I), 1);
281  arma::vec j_centroid = arma::mean(data.cols(J), 1);
282  Vespucci::Math::DistanceMetricWrapper metric(metric_type_);
283  return metric.Evaluate(i_centroid, j_centroid);
284  }
285  else
286  return distances.min(); //defaulting to single. Unlikely to occur if this is used properly.
287 }
288 
289 arma::mat Vespucci::Math::Clustering::pdist(const arma::mat &x,
290  std::string metric_type,
291  arma::uword dim,
292  std::string shape)
293 {
294  if (dim > 1) throw std::invalid_argument("dim must be 0 or 1");
295  std::set<std::string> valid_types = {"euclidean",
296  "squaredeuclidean",
297  "manhattan",
298  "chebyshev",
299  "cosine",
300  "correlation"};
301  if (!valid_types.count(metric_type))
302  throw std::invalid_argument("Unsupported metric type " + metric_type);
303 
304  Vespucci::Math::DistanceMetricWrapper metric(metric_type);
305  arma::uword size = (dim == 1 ? x.n_cols : x.n_rows);
306 
307  arma::mat pdist_mat(size, size);
308  pdist_mat.diag(0) = arma::zeros(size);
309 
310  for (arma::uword i = 0; i < size; ++i){
311  #ifdef _WIN32
312  #pragma omp parallel for default(none) \
313  shared(pdist_mat, x, size, i, metric, dim)
314  for (intmax_t j = 0; j < (intmax_t) size; ++j)
315  #else
316  #pragma omp parallel for default(none) \
317  shared(pdist_mat, x, size, i, metric, dim)
318  for (size_t j = 0; j < size; ++ j)
319  #endif
320  {
321  arma::vec first = (dim == 1 ? x.col(i) : arma::vec(x.row(i).t()));
322  arma::vec second = (dim == 1 ? x.col(j) : arma::vec(x.row(j).t()));
323  double distance = metric.Evaluate(first, second);
324  pdist_mat(i, j) = distance;
325  pdist_mat(j, i) = distance;
326  }
327  }
328  if (shape == "row"){
329  pdist_mat.reshape(1, pdist_mat.n_elem);
330  return pdist_mat;
331  }
332  if (shape == "column"){
333  pdist_mat.reshape(pdist_mat.n_elem, 1);
334  return pdist_mat;
335  }
336  return pdist_mat;
337 }
VESPUCCI_EXPORT arma::mat pdist(const arma::mat &x, std::string metric_type, arma::uword dim=1, std::string shape="row")
The AHCANode class This class stores a node in the data structure representing the AHCA dendrogram Th...
Definition: ahcanode.h:34
void Link(const arma::mat &data)
Vespucci::Math::Clustering::AHCA::Link.
void SetLinkage(std::string linkage_method)
std::vector< node_t * > nodevec
std::string linkage_method()
Vespucci::Math::Clustering::AHCA::linkage_method.
void SetMetric(std::string metric_type)
std::map< size_t, nodevec > clusters()
std::string metric_type()
Vespucci::Math::Clustering::AHCA::metric_type.
double Evaluate(arma::vec &first, arma::vec &second)
Vespucci::Math::DistanceMetricWrapper::Evaluate.
arma::mat Cluster(arma::uword k)
Vespucci::Math::Clustering::AHCA::Cluster Generate cluster assignments from tree. ...
Vespucci::Math::Clustering::AHCANode node_t