// (C) Copyright Jonathan Franklin 2008. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP #define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP #include #include #include namespace boost { namespace algorithm { namespace cluster { namespace detail { // TODO: Where should we put these? int const UNCLASSIFIED = -1; int const NOISE = 0; // TODO: Replace this naive query function w/ R*-tree or fractional cascading. // This query mechanism makes the runtime quadratic. template static void query( NTupleIterT const & query_pt, NTupleIterT const & begin, NTupleIterT const & end, typename NTupleIterT::difference_type eps, DistFunT const & d, std::vector & v) { for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt) { if (query_pt == cur_pt) continue; if (d(*query_pt->tuple, *cur_pt->tuple) > eps) continue; v.push_back(cur_pt); } } // TODO: Replace this so we don't have to store the cluster info for each tuple? template struct node { node(NTupleIterT const & t) : tuple(t), cluster(UNCLASSIFIED) {} NTupleIterT tuple; int cluster; }; } // End of namespace detail. /*! DBSCAN density-based clustering algorithm. * TODO: Document this function. * \param[in] begin * \param[in] end * \param[in] eps * \param[in] min_points * \param[in] d * \return The cluster data (partitioning of the tuples). */ template cluster_data dbscan(NTupleIterT const & begin, NTupleIterT const & end, typename NTupleIterT::difference_type const & eps, size_t min_points, DistFunT const & d) { // Concept check. function_requires< DistanceComparableConcept >(); //DistanceComparableConcept >(); // TODO: Rework the algorithm to NOT make this extra collection? typedef detail::node node; typedef std::vector ntuple_nodes; ntuple_nodes tuples; // Initialize algorithm. //size_t num_elems = 0; for(NTupleIterT it = begin; it != end; ++it) { //++num_elems; tuples.push_back(node(it)); } typedef cluster_data > cluster_data; cluster_data p; // TODO: We should try to make cluster_num go away. int cluster_num = 0; for(ntuple_nodes::iterator it = tuples.begin(); it != tuples.end(); ++it) { // Skip this tuple if its already been classified as a cluster or noise. if (it->cluster != detail::UNCLASSIFIED) continue; // Expand cluster. std::vector seeds; detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds); // If the neighborhood of this tuple is too small, then mark it as noise. if (seeds.size() < min_points) { it->cluster = detail::NOISE; continue; } // Start the next cluster. ++cluster_num; p.push_back(ClusterT()); // TODO: This is goofy. ClusterT & cur_cluster = p.back(); // Mark entire neighborhood as part of the current cluster. it->cluster = cluster_num; cur_cluster.push_back(it->tuple); for (size_t n = 0; n < seeds.size(); ++n) { seeds[n]->cluster = cluster_num; cur_cluster.push_back(seeds[n]->tuple); } // Keep adding seeds and processing them until we find all points that // are Density Reachable. while (! seeds.empty()) { ntuple_nodes::iterator cur = seeds.back(); seeds.pop_back(); std::vector results; detail::query(cur, tuples.begin(), tuples.end(), eps, d, results); if (results.size() >= min_points) { for (size_t n = 0; n < results.size(); ++n) { if (results[n]->cluster < 1) // Not assigned to cluster yet. { if (detail::UNCLASSIFIED == results[n]->cluster) seeds.push_back(results[n]); results[n]->cluster = cluster_num; cur_cluster.push_back(results[n]->tuple); } } } } } // Outer loop for all tuples. return p; } } // End of namespace cluster // TODO: Should we be exporting this? using namespace cluster; } // End of namespace algorithm } // End of namespace boost #endif // BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP