21 #include <mlpack/core/metrics/mahalanobis_distance.hpp> 50 : metric_type_(metric_type), linkage_method_(linkage_method)
52 std::set<std::string> valid_metrics = {
"euclidean",
58 std::set<std::string> valid_linkages = {
"average",
63 if (!valid_metrics.count(metric_type))
64 throw std::invalid_argument(
"Unsupported metric type " + metric_type);
65 if (!valid_linkages.count(linkage_method))
66 throw std::invalid_argument(
"Unsupported linkage method " + linkage_method);
67 if (linkage_method_ ==
"ward" && metric_type_ !=
"squaredeuclidean")
68 std::cerr <<
"Using Ward's method requires squared Euclidean distance." 69 "Squared Euclidean distance will be used in lieu of " + metric_type_ +
"\n";
83 std::cout <<
"Begin AHCA::Link()" << std::endl;
84 arma::wall_clock timer;
86 observations_ = data.n_cols;
87 merge_data_ = arma::mat(observations_, 2);
88 merge_data_.col(0) = arma::linspace(1, observations_, observations_);
91 if (linkage_method_ ==
"ward")
99 for (arma::uword i = 0; i < data.n_cols; ++i)
101 while (nodes.size() > 1){
102 clusters_[nodes.size()] = nodes;
105 arma::uvec to_merge = FindClosest(data, dist_, nodes, distance);
106 merge_data_(nodes.size() - 1, 1) = distance;
108 node_t *node =
new node_t(nodes[to_merge(0)], nodes[to_merge(1)], distance);
109 merge_data_(nodes.size() - 1, 1) = distance;
110 if (to_merge(0) > to_merge(1)){
111 nodes.erase(nodes.begin() + to_merge(0));
112 nodes.erase(nodes.begin() + to_merge(1));
115 nodes.erase(nodes.begin() + to_merge(1));
116 nodes.erase(nodes.begin() + to_merge(0));
119 nodes.push_back(node);
121 root_node_ = nodes[0];
122 clusters_[1] = nodes;
123 merge_data_.shed_row(0);
124 std::cout <<
"End of AHCA::Link()" << std::endl;
125 std::cout <<
"took " << timer.toc() <<
" s" << std::endl;
130 std::set<std::string> valid_linkages = {
"average",
135 if (!valid_linkages.count(linkage_method))
136 throw std::invalid_argument(
"Unsupported linkage method " + linkage_method);
137 linkage_method_ = linkage_method_;
142 std::set<std::string> valid_metrics = {
"euclidean",
148 if (!valid_metrics.count(metric_type))
149 throw std::invalid_argument(
"Unsupported metric type " + metric_type);
150 if (linkage_method_ ==
"ward" && metric_type_ !=
"squaredeuclidean")
151 std::cerr <<
"Using Ward's method requires squared Euclidean distance." 152 "Squared Euclidean distance will be used in lieu of " + metric_type_ +
"\n";
163 std::cout <<
"Begin AHCA::Cluster()" << std::endl;
164 arma::wall_clock timer;
166 k = (k > clusters_.size() ? clusters_.size() : k);
167 arma::mat assignments = arma::zeros(dist_.n_rows, k);
168 for (arma::uword i = 1; i <= k; ++i){
169 nodevec cluster = clusters_.at(i);
170 for (arma::uword j = 0; j < cluster.size(); ++j){
171 arma::uvec column = {i - 1};
172 arma::uvec rows = cluster[j]->GetChildIndices();
173 assignments(rows, column).fill(j+1);
176 std::cout <<
"End AHCA::Cluster()" << std::endl;
177 std::cout <<
"Took " << timer.toc() <<
" s" << std::endl;
196 return linkage_method_;
221 arma::uvec Vespucci::Math::Clustering::AHCA::FindClosest(
const arma::mat &data,
222 const arma::mat &
dist,
226 if (
clusters.size() < 2)
throw std::invalid_argument(
"Only one cluster to merge");
228 for (
size_t i = 0; i <
clusters.size(); ++i){
230 #pragma omp parallel for default(none) \ 231 shared(cluster_dist, clusters, i, data, dist) 232 for (intmax_t j = 0; j < (intmax_t)
clusters.size(); ++j)
234 #pragma omp parallel
for default(none) \
236 for (
size_t j = 0; j <
clusters.size(); ++ j)
239 cluster_dist(i, j) = ClusterDistance(i, j, data,
dist,
clusters);
240 cluster_dist(j, i) = ClusterDistance(i, j, data,
dist,
clusters);
245 cluster_dist.diag(0).fill(cluster_dist.max());
247 arma::uword min_index = cluster_dist.index_min();
248 distance = cluster_dist.min();
250 if (
clusters.size() == 2)
return arma::uvec({0, 1});
251 return arma::ind2sub(arma::size(cluster_dist), min_index);
262 double Vespucci::Math::Clustering::AHCA::ClusterDistance(
int i,
int j,
263 const arma::mat &data,
264 const arma::mat &
dist,
267 arma::uvec I = clusters[i]->GetChildIndices();
268 arma::uvec J = clusters[j]->GetChildIndices();
269 arma::mat distances =
dist.submat(I, J);
270 if (linkage_method_ ==
"single")
return distances.min();
271 if (linkage_method_ ==
"complete")
return distances.max();
272 if (linkage_method_ ==
"average")
return arma::mean(mean(distances));
273 if (linkage_method_ ==
"ward"){
274 arma::vec i_centroid = arma::mean(data.cols(I), 1);
275 arma::vec j_centroid = arma::mean(data.cols(J), 1);
277 return (metric.Evaluate(i_centroid, j_centroid) * I.n_rows * J.n_rows)/(I.n_rows + J.n_rows);
279 if (linkage_method_ ==
"centroid"){
280 arma::vec i_centroid = arma::mean(data.cols(I), 1);
281 arma::vec j_centroid = arma::mean(data.cols(J), 1);
283 return metric.Evaluate(i_centroid, j_centroid);
286 return distances.min();
294 if (dim > 1)
throw std::invalid_argument(
"dim must be 0 or 1");
295 std::set<std::string> valid_types = {
"euclidean",
301 if (!valid_types.count(metric_type))
302 throw std::invalid_argument(
"Unsupported metric type " + metric_type);
305 arma::uword size = (dim == 1 ? x.n_cols : x.n_rows);
307 arma::mat pdist_mat(size, size);
308 pdist_mat.diag(0) = arma::zeros(size);
310 for (arma::uword i = 0; i < size; ++i){
312 #pragma omp parallel for default(none) \ 313 shared(pdist_mat, x, size, i, metric, dim) 314 for (intmax_t j = 0; j < (intmax_t) size; ++j)
316 #pragma omp parallel
for default(none) \
317 shared(pdist_mat, x, size, i, metric, dim)
318 for (
size_t j = 0; j < size; ++ j)
321 arma::vec first = (dim == 1 ? x.col(i) : arma::vec(x.row(i).t()));
322 arma::vec second = (dim == 1 ? x.col(j) : arma::vec(x.row(j).t()));
323 double distance = metric.
Evaluate(first, second);
324 pdist_mat(i, j) = distance;
325 pdist_mat(j, i) = distance;
329 pdist_mat.reshape(1, pdist_mat.n_elem);
332 if (shape ==
"column"){
333 pdist_mat.reshape(pdist_mat.n_elem, 1);
VESPUCCI_EXPORT arma::mat pdist(const arma::mat &x, std::string metric_type, arma::uword dim=1, std::string shape="row")
The AHCANode class This class stores a node in the data structure representing the AHCA dendrogram Th...
void Link(const arma::mat &data)
Vespucci::Math::Clustering::AHCA::Link.
void SetLinkage(std::string linkage_method)
std::vector< node_t * > nodevec
std::string linkage_method()
Vespucci::Math::Clustering::AHCA::linkage_method.
void SetMetric(std::string metric_type)
std::map< size_t, nodevec > clusters()
std::string metric_type()
Vespucci::Math::Clustering::AHCA::metric_type.
double Evaluate(arma::vec &first, arma::vec &second)
Vespucci::Math::DistanceMetricWrapper::Evaluate.
arma::mat Cluster(arma::uword k)
Vespucci::Math::Clustering::AHCA::Cluster Generate cluster assignments from tree. ...
Vespucci::Math::Clustering::AHCANode node_t