Moved the cluster_data template into its own header.

Cleaned up the code slightly, and added some doc comments. [SVN r45221]
2008-05-08 17:40:52 +00:00
parent 51751da0fd
commit c91fe00184
2 changed files with 88 additions and 55 deletions
--- a/include/boost/algorithm/cluster/dbscan.hpp
+++ b/include/boost/algorithm/cluster/dbscan.hpp
@ -1,11 +1,8 @@
 #if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
 #define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP

-#include <boost/range/begin.hpp>
-#include <boost/range/end.hpp>
-#include <boost/shared_ptr.hpp>
+#include <boost/algorithm/cluster/cluster_data.hpp>
 #include <vector>
-#include <list>

 namespace boost
 {
@ -18,13 +15,13 @@ namespace detail
 {

 // TODO: Replace this naive query function w/ R*-tree or fractional cascading.
-// It makes the runtime quadratic.
+// This query mechanism makes the runtime quadratic.
 template<typename NTupleIter, typename DistFun>
 static void query(
  NTupleIter const & query_pt,
  NTupleIter const & begin,
  NTupleIter const & end,
-  float eps,
+  typename NTupleIter::difference_type eps,
  DistFun const & d,
  std::vector<NTupleIter> & v)
 {
@ -40,7 +37,7 @@ static void query(
  }
 }

-// TODO: Replace this so we don't have to store the cluster info for each tuple.
+// TODO: Replace this so we don't have to store the cluster info for each tuple?
 template<typename NTupleIter>
 struct node
 {
@ -52,46 +49,14 @@ struct node

 } // End of namespace detail.

-// TODO: Document this type.
-template<typename Cluster>
-struct cluster_data
-{
-  typedef Cluster value_type;
-  typedef std::vector<value_type> clusters;
-  cluster_data() : m_pClusters(new clusters) {}
-  ~cluster_data() {}
-
-  cluster_data(cluster_data const & c) : m_pClusters(c.m_pClusters) {}
-  cluster_data const & cluster_data::operator=(cluster_data const & rhs)
-  { m_pClusters = rhs.m_pClusters; }
-
-  typedef typename clusters::iterator iterator;
-  typedef typename clusters::const_iterator const_iterator;
-  typedef typename clusters::reverse_iterator reverse_iterator;
-
-  iterator begin() { return m_pClusters->begin(); }
-  iterator end() { return m_pClusters->end(); }
-
-  const_iterator begin() const { return m_pClusters->begin(); }
-  const_iterator end() const { return m_pClusters->end(); }
-
-  iterator rbegin() { return m_pClusters->rbegin(); }
-  iterator rend() { return m_pClusters->rend(); }
-
-  iterator insert(iterator loc, value_type const & val)
-  { return m_pClusters->insert(loc, val); }
-
-  void push_back(value_type const & v) { m_pClusters->push_back(v); }
-  void pop_back() { m_pClusters->pop_back(); }
-
-  value_type & back() { return m_pClusters->back(); }
-  value_type const & back() const { return m_pClusters->back(); }
-
-private:
-  boost::shared_ptr<clusters> m_pClusters;
-};
-
-/**
+/*! DBSCAN density-based clustering algorithm.
+ * TODO: Document this function.
+ * \param[in] begin
+ * \param[in] end
+ * \param[in] eps
+ * \param[in] min_points
+ * \param[in] d
+ * \return The cluster data (partitioning of the tuples).
 */
 template<typename Cluster, typename NTupleIter, typename DistFun>
 cluster_data<Cluster>
@ -101,7 +66,10 @@ dbscan(NTupleIter const & begin,
       size_t min_points,
       DistFun const & d)
 {
-  // TODO: Rework the algorithm to NOT make this extra collection.
+  int const UNCLASSIFIED = -1;
+  int const NOISE = 0;
+
+  // TODO: Rework the algorithm to NOT make this extra collection?
  typedef detail::node<NTupleIter> node;
  typedef std::vector<node> ntuple_nodes;
  ntuple_nodes tuples;
@ -111,24 +79,25 @@ dbscan(NTupleIter const & begin,
  for(NTupleIter it = begin; it != end; ++it)
  {
    //++num_elems;
-    //it->cluster = UNCLASSIFIED;
    tuples.push_back(node(it));
  }

  typedef cluster_data<std::vector<NTupleIter> > cluster_data;
  cluster_data p;

-  // Do it...
+  // TODO: We should try to make cluster_num go away.
  int cluster_num = 0;
  for(ntuple_nodes::iterator it = tuples.begin(); it != tuples.end(); ++it)
  {
-    if (it->cluster != UNCLASSIFIED) // Been classified.
+    // Skip this tuple if its already been classified as a cluster or noise.
+    if (it->cluster != UNCLASSIFIED)
      continue;

    // Expand cluster.

    std::vector<ntuple_nodes::iterator> seeds;
    detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds);
+    // If the neighborhood of this tuple is too small, then mark it as noise.
    if (seeds.size() < min_points)
    {
      it->cluster = NOISE;
@ -137,20 +106,20 @@ dbscan(NTupleIter const & begin,

    // Start the next cluster.
    ++cluster_num;
-    p.push_back(Cluster());
+    p.push_back(Cluster()); // TODO: This is goofy.
    Cluster & cur_cluster = p.back();

-    // Mark entire neighborhood as part of current cluster.
+    // Mark entire neighborhood as part of the current cluster.
    it->cluster = cluster_num;
    cur_cluster.push_back(it->tuple);
-    // TODO: Remove it from noise.
    for (size_t n = 0; n < seeds.size(); ++n)
    {
      seeds[n]->cluster = cluster_num;
      cur_cluster.push_back(seeds[n]->tuple);
-      // TODO: Remove it from noise.
    }

+    // Keep adding seeds and processing them until we find all points that
+    // are Density Reachable.
    while (! seeds.empty())
    {
      ntuple_nodes::iterator cur = seeds.back();
@ -181,6 +150,7 @@ dbscan(NTupleIter const & begin,

 } // End of namespace cluster

+// TODO: Should we be exporting this?
 using namespace cluster;

 } // End of namespace algorithm