2008-05-08 17:09:19 +00:00
|
|
|
#if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
|
|
|
|
#define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
#include <boost/algorithm/cluster/cluster_data.hpp>
|
2008-05-08 17:09:19 +00:00
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace boost
|
|
|
|
{
|
|
|
|
namespace algorithm
|
|
|
|
{
|
|
|
|
namespace cluster
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace detail
|
|
|
|
{
|
|
|
|
|
|
|
|
// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
// Every call scans the whole range [begin, end), so running one query per
// tuple makes the overall runtime quadratic.
//
// Appends to v every iterator in [begin, end) -- other than query_pt itself --
// whose tuple lies within eps of query_pt's tuple, i.e. for which
// d(*query_pt->tuple, *candidate->tuple) is not greater than eps.
// Results are appended in range order; v is not cleared first.
template<typename NTupleIter, typename DistFun>
static void query(
    NTupleIter const & query_pt,
    NTupleIter const & begin,
    NTupleIter const & end,
    typename NTupleIter::difference_type eps,
    DistFun const & d,
    std::vector<NTupleIter> & v)
{
    for (NTupleIter candidate = begin; candidate != end; ++candidate)
    {
        // The identity test comes first so that d() is never evaluated for
        // the query point against itself.
        bool const is_other_point = !(query_pt == candidate);
        if (is_other_point && !(d(*query_pt->tuple, *candidate->tuple) > eps))
        {
            v.push_back(candidate);
        }
    }
}
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
// TODO: Replace this so we don't have to store the cluster info for each tuple?
//
// Pairs an iterator to a caller-owned tuple with the cluster label that has
// been assigned to that tuple so far.
template<typename NTupleIter>
struct node
{
    // Label carried by a node that has not been examined yet. It is declared
    // here because the constructor needs it and no such name is visible at
    // namespace scope; the value matches the UNCLASSIFIED constant local to
    // dbscan().
    enum { UNCLASSIFIED = -1 };

    // Wraps the tuple iterator t; the node starts out unclassified.
    node(NTupleIter const & t) : tuple(t), cluster(UNCLASSIFIED) {}

    NTupleIter tuple; // Iterator referring to the wrapped tuple.
    int cluster;      // Current cluster label (UNCLASSIFIED until assigned).
};
|
|
|
|
|
|
|
|
} // End of namespace detail.
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
/*! DBSCAN density-based clustering algorithm.
|
|
|
|
* TODO: Document this function.
|
|
|
|
* \param[in] begin
|
|
|
|
* \param[in] end
|
|
|
|
* \param[in] eps
|
|
|
|
* \param[in] min_points
|
|
|
|
* \param[in] d
|
|
|
|
* \return The cluster data (partitioning of the tuples).
|
2008-05-08 17:09:19 +00:00
|
|
|
*/
|
|
|
|
template<typename Cluster, typename NTupleIter, typename DistFun>
|
|
|
|
cluster_data<Cluster>
|
|
|
|
dbscan(NTupleIter const & begin,
|
|
|
|
NTupleIter const & end,
|
|
|
|
typename NTupleIter::difference_type const & eps,
|
|
|
|
size_t min_points,
|
|
|
|
DistFun const & d)
|
|
|
|
{
|
2008-05-08 17:40:52 +00:00
|
|
|
int const UNCLASSIFIED = -1;
|
|
|
|
int const NOISE = 0;
|
|
|
|
|
|
|
|
// TODO: Rework the algorithm to NOT make this extra collection?
|
2008-05-08 17:09:19 +00:00
|
|
|
typedef detail::node<NTupleIter> node;
|
|
|
|
typedef std::vector<node> ntuple_nodes;
|
|
|
|
ntuple_nodes tuples;
|
|
|
|
|
|
|
|
// Initialize algorithm.
|
|
|
|
//size_t num_elems = 0;
|
|
|
|
for(NTupleIter it = begin; it != end; ++it)
|
|
|
|
{
|
|
|
|
//++num_elems;
|
|
|
|
tuples.push_back(node(it));
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef cluster_data<std::vector<NTupleIter> > cluster_data;
|
|
|
|
cluster_data p;
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
// TODO: We should try to make cluster_num go away.
|
2008-05-08 17:09:19 +00:00
|
|
|
int cluster_num = 0;
|
|
|
|
for(ntuple_nodes::iterator it = tuples.begin(); it != tuples.end(); ++it)
|
|
|
|
{
|
2008-05-08 17:40:52 +00:00
|
|
|
// Skip this tuple if its already been classified as a cluster or noise.
|
|
|
|
if (it->cluster != UNCLASSIFIED)
|
2008-05-08 17:09:19 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Expand cluster.
|
|
|
|
|
|
|
|
std::vector<ntuple_nodes::iterator> seeds;
|
|
|
|
detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds);
|
2008-05-08 17:40:52 +00:00
|
|
|
// If the neighborhood of this tuple is too small, then mark it as noise.
|
2008-05-08 17:09:19 +00:00
|
|
|
if (seeds.size() < min_points)
|
|
|
|
{
|
|
|
|
it->cluster = NOISE;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start the next cluster.
|
|
|
|
++cluster_num;
|
2008-05-08 17:40:52 +00:00
|
|
|
p.push_back(Cluster()); // TODO: This is goofy.
|
2008-05-08 17:09:19 +00:00
|
|
|
Cluster & cur_cluster = p.back();
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
// Mark entire neighborhood as part of the current cluster.
|
2008-05-08 17:09:19 +00:00
|
|
|
it->cluster = cluster_num;
|
|
|
|
cur_cluster.push_back(it->tuple);
|
|
|
|
for (size_t n = 0; n < seeds.size(); ++n)
|
|
|
|
{
|
|
|
|
seeds[n]->cluster = cluster_num;
|
|
|
|
cur_cluster.push_back(seeds[n]->tuple);
|
|
|
|
}
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
// Keep adding seeds and processing them until we find all points that
|
|
|
|
// are Density Reachable.
|
2008-05-08 17:09:19 +00:00
|
|
|
while (! seeds.empty())
|
|
|
|
{
|
|
|
|
ntuple_nodes::iterator cur = seeds.back();
|
|
|
|
seeds.pop_back();
|
|
|
|
|
|
|
|
std::vector<ntuple_nodes::iterator> results;
|
|
|
|
detail::query(cur, tuples.begin(), tuples.end(), eps, d, results);
|
|
|
|
|
|
|
|
if (results.size() >= min_points)
|
|
|
|
{
|
|
|
|
for (size_t n = 0; n < results.size(); ++n)
|
|
|
|
{
|
|
|
|
if (results[n]->cluster < 1) // Not assigned to cluster yet.
|
|
|
|
{
|
|
|
|
if (UNCLASSIFIED == results[n]->cluster)
|
|
|
|
seeds.push_back(results[n]);
|
|
|
|
results[n]->cluster = cluster_num;
|
|
|
|
cur_cluster.push_back(results[n]->tuple);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // Outer loop for all tuples.
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // End of namespace cluster
|
|
|
|
|
2008-05-08 17:40:52 +00:00
|
|
|
// TODO: Should we be exporting this?
|
2008-05-08 17:09:19 +00:00
|
|
|
using namespace cluster;
|
|
|
|
|
|
|
|
} // End of namespace algorithm
|
|
|
|
|
|
|
|
} // End of namespace boost
|
|
|
|
|
|
|
|
#endif // BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
|