diff --git a/include/boost/algorithm/cluster/dbscan.hpp b/include/boost/algorithm/cluster/dbscan.hpp new file mode 100644 index 0000000..808dc84 --- /dev/null +++ b/include/boost/algorithm/cluster/dbscan.hpp @@ -0,0 +1,190 @@ +#if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP +#define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP + +#include +#include +#include +#include +#include + +namespace boost +{ +namespace algorithm +{ +namespace cluster +{ + +namespace detail +{ + +// TODO: Replace this naive query function w/ R*-tree or fractional cascading. +// It makes the runtime quadratic. +template +static void query( + NTupleIter const & query_pt, + NTupleIter const & begin, + NTupleIter const & end, + float eps, + DistFun const & d, + std::vector & v) +{ + for(NTupleIter cur_pt = begin; cur_pt != end; ++cur_pt) + { + if (query_pt == cur_pt) + continue; + + if (d(*query_pt->tuple, *cur_pt->tuple) > eps) + continue; + + v.push_back(cur_pt); + } +} + +// TODO: Replace this so we don't have to store the cluster info for each tuple. +template +struct node +{ + node(NTupleIter const & t) : tuple(t), cluster(UNCLASSIFIED) {} + + NTupleIter tuple; + int cluster; +}; + +} // End of namespace detail. + +// TODO: Document this type. +template +struct cluster_data +{ + typedef Cluster value_type; + typedef std::vector clusters; + cluster_data() : m_pClusters(new clusters) {} + ~cluster_data() {} + + cluster_data(cluster_data const & c) : m_pClusters(c.m_pClusters) {} + cluster_data const & cluster_data::operator=(cluster_data const & rhs) + { m_pClusters = rhs.m_pClusters; } + + typedef typename clusters::iterator iterator; + typedef typename clusters::const_iterator const_iterator; + typedef typename clusters::reverse_iterator reverse_iterator; + + iterator begin() { return m_pClusters->begin(); } + iterator end() { return m_pClusters->end(); } + + const_iterator begin() const { return m_pClusters->begin(); } + const_iterator end() const { return m_pClusters->end(); } + + iterator rbegin() { return m_pClusters->rbegin(); } + iterator rend() { return m_pClusters->rend(); } + + iterator insert(iterator loc, value_type const & val) + { return m_pClusters->insert(loc, val); } + + void push_back(value_type const & v) { m_pClusters->push_back(v); } + void pop_back() { m_pClusters->pop_back(); } + + value_type & back() { return m_pClusters->back(); } + value_type const & back() const { return m_pClusters->back(); } + +private: + boost::shared_ptr m_pClusters; +}; + +/** + */ +template +cluster_data +dbscan(NTupleIter const & begin, + NTupleIter const & end, + typename NTupleIter::difference_type const & eps, + size_t min_points, + DistFun const & d) +{ + // TODO: Rework the algorithm to NOT make this extra collection. + typedef detail::node node; + typedef std::vector ntuple_nodes; + ntuple_nodes tuples; + + // Initialize algorithm. + //size_t num_elems = 0; + for(NTupleIter it = begin; it != end; ++it) + { + //++num_elems; + //it->cluster = UNCLASSIFIED; + tuples.push_back(node(it)); + } + + typedef cluster_data > cluster_data; + cluster_data p; + + // Do it... + int cluster_num = 0; + for(ntuple_nodes::iterator it = tuples.begin(); it != tuples.end(); ++it) + { + if (it->cluster != UNCLASSIFIED) // Been classified. + continue; + + // Expand cluster. + + std::vector seeds; + detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds); + if (seeds.size() < min_points) + { + it->cluster = NOISE; + continue; + } + + // Start the next cluster. + ++cluster_num; + p.push_back(Cluster()); + Cluster & cur_cluster = p.back(); + + // Mark entire neighborhood as part of current cluster. + it->cluster = cluster_num; + cur_cluster.push_back(it->tuple); + // TODO: Remove it from noise. + for (size_t n = 0; n < seeds.size(); ++n) + { + seeds[n]->cluster = cluster_num; + cur_cluster.push_back(seeds[n]->tuple); + // TODO: Remove it from noise. + } + + while (! seeds.empty()) + { + ntuple_nodes::iterator cur = seeds.back(); + seeds.pop_back(); + + std::vector results; + detail::query(cur, tuples.begin(), tuples.end(), eps, d, results); + + if (results.size() >= min_points) + { + for (size_t n = 0; n < results.size(); ++n) + { + if (results[n]->cluster < 1) // Not assigned to cluster yet. + { + if (UNCLASSIFIED == results[n]->cluster) + seeds.push_back(results[n]); + results[n]->cluster = cluster_num; + cur_cluster.push_back(results[n]->tuple); + } + } + } + } + + } // Outer loop for all tuples. + + return p; +} + +} // End of namespace cluster + +using namespace cluster; + +} // End of namespace algorithm + +} // End of namespace boost + +#endif // BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP