diff --git a/include/boost/algorithm/cluster/cluster_data.hpp b/include/boost/algorithm/cluster/cluster_data.hpp deleted file mode 100644 index 6e998fb..0000000 --- a/include/boost/algorithm/cluster/cluster_data.hpp +++ /dev/null @@ -1,69 +0,0 @@ -// (C) Copyright Jonathan Franklin 2008. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file -// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#if ! defined BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP -#define BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP - -#include -#include - -namespace boost -{ -namespace algorithm -{ -namespace cluster -{ - -/*! TODO: Document this type. - */ -template -struct cluster_data -{ - typedef ClusterT value_type; - typedef std::vector clusters; - cluster_data() : m_pClusters(new clusters) {} - ~cluster_data() {} - - cluster_data(cluster_data const & c) : m_pClusters(c.m_pClusters) {} - cluster_data const & cluster_data::operator=(cluster_data const & rhs) - { m_pClusters = rhs.m_pClusters; } - - typedef typename clusters::iterator iterator; - typedef typename clusters::const_iterator const_iterator; - typedef typename clusters::reverse_iterator reverse_iterator; - - iterator begin() { return m_pClusters->begin(); } - iterator end() { return m_pClusters->end(); } - - const_iterator begin() const { return m_pClusters->begin(); } - const_iterator end() const { return m_pClusters->end(); } - - iterator rbegin() { return m_pClusters->rbegin(); } - iterator rend() { return m_pClusters->rend(); } - - iterator insert(iterator loc, value_type const & val) - { return m_pClusters->insert(loc, val); } - - void push_back(value_type const & v) { m_pClusters->push_back(v); } - void pop_back() { m_pClusters->pop_back(); } - - value_type & back() { return m_pClusters->back(); } - value_type const & back() const { return m_pClusters->back(); } - - size_t size() const { return m_pClusters->size(); } -private: - boost::shared_ptr m_pClusters; -}; - -} // End of namespace cluster - -// TODO: Should we be exporting this? -using namespace cluster; - -} // End of namespace algorithm - -} // End of namespace boost - -#endif // BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP diff --git a/include/boost/algorithm/cluster/concept.hpp b/include/boost/algorithm/cluster/concept.hpp deleted file mode 100644 index 9832756..0000000 --- a/include/boost/algorithm/cluster/concept.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// (C) Copyright Jonathan Franklin 2008. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file -// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#if ! defined BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP -#define BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP - -#include - -namespace boost -{ -namespace algorithm -{ -namespace cluster -{ - - // TODO: Document the purpose of this concept. - template - struct DistanceComparableConcept - { - void constraints() - { - // Operation - d(t, t); - } - private: - T t; - DistanceFunT d; - }; - - // TODO: Add concepts here, then delete this comment. - -} // End of namespace cluster; -} // End of namespace algorithm; -} // End of namespace boost; - -#endif // BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP diff --git a/include/boost/algorithm/cluster/dbscan.hpp b/include/boost/algorithm/cluster/dbscan.hpp deleted file mode 100644 index f0d8dcc..0000000 --- a/include/boost/algorithm/cluster/dbscan.hpp +++ /dev/null @@ -1,153 +0,0 @@ -// (C) Copyright Jonathan Franklin 2008. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file -// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP -#define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP - -#include -#include -#include -#include -#include - -namespace boost -{ -namespace algorithm -{ -namespace cluster -{ - -namespace detail -{ -// TODO: Where should we put these? -int const UNCLASSIFIED = -1; -int const NOISE = 0; - -// TODO: Replace this so we don't have to store the cluster info for each tuple? -template -struct node -{ - node(NTupleIterT const & t) : tuple(t), cluster(UNCLASSIFIED) {} - - NTupleIterT tuple; - int cluster; -}; - -} // End of namespace detail. - -/*! DBSCAN density-based clustering algorithm. - * TODO: Document this function. - * \param[in] begin - * \param[in] end - * \param[in] eps - * \param[in] min_points - * \param[in] d - * \return The cluster data (partitioning of the tuples). - */ -template -cluster_data -dbscan(NTupleIterT const & begin, - NTupleIterT const & end, - DistanceT const & eps, - size_t min_points, - DistFunT const & d) -{ - // Concept check. - function_requires< - DistanceComparableConcept >(); - //DistanceComparableConcept >(); - function_requires< - DistanceComparableConcept >(); - - // TODO: Rework the algorithm to NOT make this extra collection? - typedef detail::node node; - typedef std::vector ntuple_nodes; - ntuple_nodes tuples; - - // Initialize algorithm. - //size_t num_elems = 0; - for(NTupleIterT it = begin; it != end; ++it) - { - //++num_elems; - tuples.push_back(node(it)); - } - - typedef cluster_data > cluster_data; - cluster_data p; - - // TODO: We should try to make cluster_num go away. - int cluster_num = 0; - for(ntuple_nodes::iterator it = tuples.begin(); it != tuples.end(); ++it) - { - // Skip this tuple if its already been classified as a cluster or noise. - if (it->cluster != detail::UNCLASSIFIED) - continue; - - // Expand cluster. - - std::vector seeds; - detail::naive_query(it, tuples.begin(), tuples.end(), eps, d, seeds); - // If the neighborhood of this tuple is too small, then mark it as noise. - if (seeds.size() < min_points) - { - it->cluster = detail::NOISE; - continue; - } - - // Start the next cluster. - ++cluster_num; - p.push_back(ClusterT()); // TODO: This is goofy. - ClusterT & cur_cluster = p.back(); - - // Mark entire neighborhood as part of the current cluster. - it->cluster = cluster_num; - cur_cluster.push_back(it->tuple); - for (size_t n = 0; n < seeds.size(); ++n) - { - seeds[n]->cluster = cluster_num; - cur_cluster.push_back(seeds[n]->tuple); - } - - // Keep adding seeds and processing them until we find all points that - // are Density Reachable. - while (! seeds.empty()) - { - ntuple_nodes::iterator cur = seeds.back(); - seeds.pop_back(); - - std::vector results; - detail::naive_query(cur, tuples.begin(), tuples.end(), eps, d, results); - - if (results.size() >= min_points) - { - for (size_t n = 0; n < results.size(); ++n) - { - if (results[n]->cluster < 1) // Not assigned to cluster yet. - { - if (detail::UNCLASSIFIED == results[n]->cluster) - seeds.push_back(results[n]); - results[n]->cluster = cluster_num; - cur_cluster.push_back(results[n]->tuple); - } - } - } - } - - } // Outer loop for all tuples. - - return p; -} - -} // End of namespace cluster - -// TODO: Should we be exporting this? -using namespace cluster; - -} // End of namespace algorithm - -} // End of namespace boost - -#endif // BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP diff --git a/include/boost/algorithm/cluster/detail/naive_query.hpp b/include/boost/algorithm/cluster/detail/naive_query.hpp deleted file mode 100644 index 5a05409..0000000 --- a/include/boost/algorithm/cluster/detail/naive_query.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// (C) Copyright Jonathan Franklin 2008. -// Use, modification and distribution are subject to the -// Boost Software License, Version 1.0. (See accompanying file -// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#if ! defined BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP -#define BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP - -#include -#include -#include - -namespace boost -{ -namespace algorithm -{ -namespace cluster -{ -namespace detail -{ - -// TODO: Replace this naive query function w/ R*-tree or fractional cascading. -// This query mechanism makes the runtime quadratic. -template -static void naive_query( - NTupleIterT const & query_pt, - NTupleIterT const & begin, - NTupleIterT const & end, - DistanceT const & eps, - DistFunT const & d, - std::vector & v) -{ - for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt) - { - if (query_pt == cur_pt) - continue; - - if (d(*query_pt->tuple, *cur_pt->tuple) > eps) - continue; - - v.push_back(cur_pt); - } -} - -} // End of namespace detail. -} // End of namespace cluster -} // End of namespace algorithm -} // End of namespace boost - -#endif // BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP diff --git a/include/boost/algorithm/cluster/k_means.hpp b/include/boost/algorithm/cluster/k_means.hpp deleted file mode 100644 index 12505e8..0000000 --- a/include/boost/algorithm/cluster/k_means.hpp +++ /dev/null @@ -1,195 +0,0 @@ -/***** -** References -** - J. MacQueen, "Some methods for classification and analysis -** of multivariate observations", Fifth Berkeley Symposium on -** Math Statistics and Probability, 281-297, 1967. -** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm -** on distributed memory multiprocessors", -** Large-Scale Parallel Data Mining, 245-260, 1999. -** Yuanming Chen, 2008-05-08 -*/ - -#ifndef BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP -#define BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP - -#include -#include -//#include "common.hpp" -#include -#include -#include - -namespace boost { - namespace algorithm { - namespace cluster { - namespace detail { - template - //The original C function - int *k_means(AttributeType **data, int n, int m, int k, differenceType eps, AttributeType **centroids) - { - /* output cluster label for each data point */ - int *labels = (int*)calloc(n, sizeof(int)); - - int h, i, j; /* loop counters, of course :) */ - int *counts = (int*)calloc(k, sizeof(int)); /* size of each cluster */ - AttributeType old_error, error = FLT_MAX; /* sum of squared euclidean distance */ - AttributeType **c = centroids ? centroids : (AttributeType**)calloc(k, sizeof(AttributeType*)); - AttributeType **c1 = (AttributeType**)calloc(k, sizeof(AttributeType*)); /* temp centroids */ - - //assert(data && k > 0 && k <= n && m > 0 && t >= 0); /* for debugging */ - - /**** - ** initialization */ - - for (h = i = 0; i < k; h += n / k, i++) { - c1[i] = (AttributeType*)calloc(m, sizeof(AttributeType)); - if (!centroids) { - c[i] = (AttributeType*)calloc(m, sizeof(AttributeType)); - } - /* pick k points as initial centroids */ - for (j = m; j-- > 0; c[i][j] = data[h][j]); - } - - /**** - ** main loop */ - - do { - /* save error from last step */ - old_error = error, error = 0; - - /* clear old counts and temp centroids */ - for (i = 0; i < k; counts[i++] = 0) { - for (j = 0; j < m; c1[i][j++] = 0); - } - - for (h = 0; h < n; h++) { - /* identify the closest cluster */ - AttributeType min_distance = FLT_MAX; - for (i = 0; i < k; i++) { - AttributeType distance = 0; - for (j = m; j-- > 0; distance += pow(data[h][j] - c[i][j], 2)); - if (distance < min_distance) { - labels[h] = i; - min_distance = distance; - } - } - /* update size and temp centroid of the destination cluster */ - for (j = m; j-- > 0; c1[labels[h]][j] += data[h][j]); - counts[labels[h]]++; - /* update standard error */ - error += min_distance; - } - - for (i = 0; i < k; i++) { /* update all centroids */ - for (j = 0; j < m; j++) { - c[i][j] = counts[i] ? c1[i][j] / counts[i] : c1[i][j]; - } - } - - } while (fabs(error - old_error) > eps); - - /**** - ** housekeeping */ - - for (i = 0; i < k; i++) { - if (!centroids) { - free(c[i]); - } - free(c1[i]); - } - - if (!centroids) { - free(c); - } - free(c1); - - free(counts); - - return labels; - } - } //End of details namespace - - template - struct KMeansCluster { - PointType centroid; - std::vector points; //The indice of points are stored here - }; - - template - struct KMeansClustering { - typedef std::vector< KMeansCluster > type; - type clusters; - }; - - /** - * @param first: the first data point's iterator - * @param last: the last data point's iterator - * @param k: the k value for the k-mean algorithm - * @return collections of clusters - */ - template - typename KMeansClustering< typename KMeansCluster > - k_means(NTupleIter first, NTupleIter last, unsigned k, - typename NTupleIter::difference_type const & eps) - { - typedef NTupleIter::difference_type DistanceType; - typedef NTupleIter::value_type PointType; - typedef PointType::value_type AttributeType; //For the c funtion test, it will be a double type - const DistanceType knumOfPoints = last - first; //The n variable in the C function - const size_t knDimension = PointType::size(); //The m variable in the C function - - AttributeType** ppData = new AttributeType* [knumOfPoints]; - AttributeType** centroids = new AttributeType* [k]; - //Pre-allocate the result array - for(size_t nCentroid = 0; nCentroid < k; nCentroid++) - { - centroids[nCentroid] = new AttributeType[knDimension]; - } - - int nIndex = 0; - for(NTupleIter iter = first; iter != last; iter++, nIndex++) - { - PointType& pt= *iter; //A point - ppData[nIndex] = new AttributeType[knDimension]; - for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++) - { - ppData[nIndex][nAttribute] = pt[nAttribute]; - } - } - - int* labels = detail::k_means(ppData, (int) knumOfPoints, (int) knDimension, k, eps, centroids); - - typedef KMeansCluster KMeansClusterType; - KMeansClustering< KMeansClusterType > clustering; - for(size_t nCentroid = 0; nCentroid < k; nCentroid++) - { - - KMeansClusterType cluster; - PointType centroid; - for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++) - { - centroid[nAttribute] = centroids[nCentroid][nAttribute]; - } - cluster.centroid = centroid; - clustering.clusters.push_back(cluster); - delete[] centroids[nCentroid]; - } - - for(int nPoint = 0; nPoint < knumOfPoints; nPoint++) - { - int nCentroidIndex = labels[nPoint]; - clustering.clusters[nCentroidIndex].points.push_back(nPoint); - delete[] ppData[nPoint]; - } - - delete[] centroids; - delete[] ppData; - delete[] labels; - - return clustering; - } - } //End of cluster namespace - } //End of algorithm namespace -} //End of boost namespace - -#endif // BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP