diff --git a/include/boost/algorithm/cluster/dbscan.hpp b/include/boost/algorithm/cluster/dbscan.hpp index 6de93c7..a7a9480 100644 --- a/include/boost/algorithm/cluster/dbscan.hpp +++ b/include/boost/algorithm/cluster/dbscan.hpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace boost @@ -23,29 +24,6 @@ namespace detail int const UNCLASSIFIED = -1; int const NOISE = 0; -// TODO: Replace this naive query function w/ R*-tree or fractional cascading. -// This query mechanism makes the runtime quadratic. -template -static void query( - NTupleIterT const & query_pt, - NTupleIterT const & begin, - NTupleIterT const & end, - typename NTupleIterT::difference_type eps, - DistFunT const & d, - std::vector & v) -{ - for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt) - { - if (query_pt == cur_pt) - continue; - - if (d(*query_pt->tuple, *cur_pt->tuple) > eps) - continue; - - v.push_back(cur_pt); - } -} - // TODO: Replace this so we don't have to store the cluster info for each tuple? template struct node @@ -107,7 +85,7 @@ dbscan(NTupleIterT const & begin, // Expand cluster. std::vector seeds; - detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds); + detail::naive_query(it, tuples.begin(), tuples.end(), eps, d, seeds); // If the neighborhood of this tuple is too small, then mark it as noise. if (seeds.size() < min_points) { @@ -137,7 +115,7 @@ dbscan(NTupleIterT const & begin, seeds.pop_back(); std::vector results; - detail::query(cur, tuples.begin(), tuples.end(), eps, d, results); + detail::naive_query(cur, tuples.begin(), tuples.end(), eps, d, results); if (results.size() >= min_points) { diff --git a/include/boost/algorithm/cluster/detail/naive_query.hpp b/include/boost/algorithm/cluster/detail/naive_query.hpp new file mode 100644 index 0000000..7ba780c --- /dev/null +++ b/include/boost/algorithm/cluster/detail/naive_query.hpp @@ -0,0 +1,50 @@ +// (C) Copyright Jonathan Franklin 2008. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#if ! defined BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP +#define BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP + +#include +#include +#include + +namespace boost +{ +namespace algorithm +{ +namespace cluster +{ +namespace detail +{ + +// TODO: Replace this naive query function w/ R*-tree or fractional cascading. +// This query mechanism makes the runtime quadratic. +template +static void naive_query( + NTupleIterT const & query_pt, + NTupleIterT const & begin, + NTupleIterT const & end, + typename NTupleIterT::difference_type eps, + DistFunT const & d, + std::vector & v) +{ + for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt) + { + if (query_pt == cur_pt) + continue; + + if (d(*query_pt->tuple, *cur_pt->tuple) > eps) + continue; + + v.push_back(cur_pt); + } +} + +} // End of namespace detail. +} // End of namespace cluster +} // End of namespace algorithm +} // End of namespace boost + +#endif // BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP diff --git a/include/boost/algorithm/cluster/k_means.hpp b/include/boost/algorithm/cluster/k_means.hpp new file mode 100644 index 0000000..12505e8 --- /dev/null +++ b/include/boost/algorithm/cluster/k_means.hpp @@ -0,0 +1,195 @@ +/***** +** References +** - J. MacQueen, "Some methods for classification and analysis +** of multivariate observations", Fifth Berkeley Symposium on +** Math Statistics and Probability, 281-297, 1967. +** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm +** on distributed memory multiprocessors", +** Large-Scale Parallel Data Mining, 245-260, 1999. +** Yuanming Chen, 2008-05-08 +*/ + +#ifndef BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP +#define BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP + +#include +#include +//#include "common.hpp" +#include +#include +#include + +namespace boost { + namespace algorithm { + namespace cluster { + namespace detail { + template + //The original C function + int *k_means(AttributeType **data, int n, int m, int k, differenceType eps, AttributeType **centroids) + { + /* output cluster label for each data point */ + int *labels = (int*)calloc(n, sizeof(int)); + + int h, i, j; /* loop counters, of course :) */ + int *counts = (int*)calloc(k, sizeof(int)); /* size of each cluster */ + AttributeType old_error, error = FLT_MAX; /* sum of squared euclidean distance */ + AttributeType **c = centroids ? centroids : (AttributeType**)calloc(k, sizeof(AttributeType*)); + AttributeType **c1 = (AttributeType**)calloc(k, sizeof(AttributeType*)); /* temp centroids */ + + //assert(data && k > 0 && k <= n && m > 0 && t >= 0); /* for debugging */ + + /**** + ** initialization */ + + for (h = i = 0; i < k; h += n / k, i++) { + c1[i] = (AttributeType*)calloc(m, sizeof(AttributeType)); + if (!centroids) { + c[i] = (AttributeType*)calloc(m, sizeof(AttributeType)); + } + /* pick k points as initial centroids */ + for (j = m; j-- > 0; c[i][j] = data[h][j]); + } + + /**** + ** main loop */ + + do { + /* save error from last step */ + old_error = error, error = 0; + + /* clear old counts and temp centroids */ + for (i = 0; i < k; counts[i++] = 0) { + for (j = 0; j < m; c1[i][j++] = 0); + } + + for (h = 0; h < n; h++) { + /* identify the closest cluster */ + AttributeType min_distance = FLT_MAX; + for (i = 0; i < k; i++) { + AttributeType distance = 0; + for (j = m; j-- > 0; distance += pow(data[h][j] - c[i][j], 2)); + if (distance < min_distance) { + labels[h] = i; + min_distance = distance; + } + } + /* update size and temp centroid of the destination cluster */ + for (j = m; j-- > 0; c1[labels[h]][j] += data[h][j]); + counts[labels[h]]++; + /* update standard error */ + error += min_distance; + } + + for (i = 0; i < k; i++) { /* update all centroids */ + for (j = 0; j < m; j++) { + c[i][j] = counts[i] ? c1[i][j] / counts[i] : c1[i][j]; + } + } + + } while (fabs(error - old_error) > eps); + + /**** + ** housekeeping */ + + for (i = 0; i < k; i++) { + if (!centroids) { + free(c[i]); + } + free(c1[i]); + } + + if (!centroids) { + free(c); + } + free(c1); + + free(counts); + + return labels; + } + } //End of details namespace + + template + struct KMeansCluster { + PointType centroid; + std::vector points; //The indice of points are stored here + }; + + template + struct KMeansClustering { + typedef std::vector< KMeansCluster > type; + type clusters; + }; + + /** + * @param first: the first data point's iterator + * @param last: the last data point's iterator + * @param k: the k value for the k-mean algorithm + * @return collections of clusters + */ + template + typename KMeansClustering< typename KMeansCluster > + k_means(NTupleIter first, NTupleIter last, unsigned k, + typename NTupleIter::difference_type const & eps) + { + typedef NTupleIter::difference_type DistanceType; + typedef NTupleIter::value_type PointType; + typedef PointType::value_type AttributeType; //For the c funtion test, it will be a double type + const DistanceType knumOfPoints = last - first; //The n variable in the C function + const size_t knDimension = PointType::size(); //The m variable in the C function + + AttributeType** ppData = new AttributeType* [knumOfPoints]; + AttributeType** centroids = new AttributeType* [k]; + //Pre-allocate the result array + for(size_t nCentroid = 0; nCentroid < k; nCentroid++) + { + centroids[nCentroid] = new AttributeType[knDimension]; + } + + int nIndex = 0; + for(NTupleIter iter = first; iter != last; iter++, nIndex++) + { + PointType& pt= *iter; //A point + ppData[nIndex] = new AttributeType[knDimension]; + for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++) + { + ppData[nIndex][nAttribute] = pt[nAttribute]; + } + } + + int* labels = detail::k_means(ppData, (int) knumOfPoints, (int) knDimension, k, eps, centroids); + + typedef KMeansCluster KMeansClusterType; + KMeansClustering< KMeansClusterType > clustering; + for(size_t nCentroid = 0; nCentroid < k; nCentroid++) + { + + KMeansClusterType cluster; + PointType centroid; + for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++) + { + centroid[nAttribute] = centroids[nCentroid][nAttribute]; + } + cluster.centroid = centroid; + clustering.clusters.push_back(cluster); + delete[] centroids[nCentroid]; + } + + for(int nPoint = 0; nPoint < knumOfPoints; nPoint++) + { + int nCentroidIndex = labels[nPoint]; + clustering.clusters[nCentroidIndex].points.push_back(nPoint); + delete[] ppData[nPoint]; + } + + delete[] centroids; + delete[] ppData; + delete[] labels; + + return clustering; + } + } //End of cluster namespace + } //End of algorithm namespace +} //End of boost namespace + +#endif // BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP