Created detail subdir.

Moved naive query function into its own header file in detail subdir. Added header for Yuan Ming Chen's k_means algorithm implementation. [SVN r45228]
2008-05-08 20:36:49 +00:00
parent cdf58b4785
commit 64d219039e
3 changed files with 248 additions and 25 deletions
--- a/include/boost/algorithm/cluster/dbscan.hpp
+++ b/include/boost/algorithm/cluster/dbscan.hpp
@ -8,6 +8,7 @@

 #include <boost/algorithm/cluster/cluster_data.hpp>
 #include <boost/algorithm/cluster/concept.hpp>
+#include <boost/algorithm/cluster/detail/naive_query.hpp>
 #include <vector>

 namespace boost
@ -23,29 +24,6 @@ namespace detail
 int const UNCLASSIFIED = -1;
 int const NOISE = 0;

-// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
-// This query mechanism makes the runtime quadratic.
-template<typename NTupleIterT, typename DistFunT>
-static void query(
-  NTupleIterT const & query_pt,
-  NTupleIterT const & begin,
-  NTupleIterT const & end,
-  typename NTupleIterT::difference_type eps,
-  DistFunT const & d,
-  std::vector<NTupleIterT> & v)
-{
-  for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt)
-  {
-    if (query_pt == cur_pt)
-      continue;
-
-    if (d(*query_pt->tuple, *cur_pt->tuple) > eps)
-      continue;
-
-    v.push_back(cur_pt);
-  }
-}
-
 // TODO: Replace this so we don't have to store the cluster info for each tuple?
 template<typename NTupleIterT>
 struct node
@ -107,7 +85,7 @@ dbscan(NTupleIterT const & begin,
    // Expand cluster.

    std::vector<ntuple_nodes::iterator> seeds;
-    detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds);
+    detail::naive_query(it, tuples.begin(), tuples.end(), eps, d, seeds);
    // If the neighborhood of this tuple is too small, then mark it as noise.
    if (seeds.size() < min_points)
    {
@ -137,7 +115,7 @@ dbscan(NTupleIterT const & begin,
      seeds.pop_back();

      std::vector<ntuple_nodes::iterator> results;
-      detail::query(cur, tuples.begin(), tuples.end(), eps, d, results);
+      detail::naive_query(cur, tuples.begin(), tuples.end(), eps, d, results);

      if (results.size() >= min_points)
      {
--- a/include/boost/algorithm/cluster/detail/naive_query.hpp
+++ b/include/boost/algorithm/cluster/detail/naive_query.hpp
@ -0,0 +1,50 @@
+//  (C) Copyright Jonathan Franklin 2008.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#if ! defined BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
+#define BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
+
+#include <boost/algorithm/cluster/cluster_data.hpp>
+#include <boost/algorithm/cluster/concept.hpp>
+#include <vector>
+
+namespace boost
+{
+namespace algorithm
+{
+namespace cluster
+{
+namespace detail
+{
+
+// Brute-force neighbourhood search: appends to v every iterator in
+// [begin, end) whose tuple is no farther than eps from query_pt's tuple,
+// as measured by d. The query point itself is never reported. Results are
+// appended in traversal order; v is not cleared first.
+//
+// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
+// This query mechanism makes the runtime quadratic.
+template<typename NTupleIterT, typename DistFunT>
+static void naive_query(
+  NTupleIterT const & query_pt,
+  NTupleIterT const & begin,
+  NTupleIterT const & end,
+  typename NTupleIterT::difference_type eps,
+  DistFunT const & d,
+  std::vector<NTupleIterT> & v)
+{
+  NTupleIterT candidate = begin;
+  while (candidate != end)
+  {
+    // The identity test comes first so d is never evaluated on the
+    // query point against itself.
+    bool const is_self = (query_pt == candidate);
+    if (!is_self && !(d(*query_pt->tuple, *candidate->tuple) > eps))
+      v.push_back(candidate);
+
+    ++candidate;
+  }
+}
+
+} // End of namespace detail.
+} // End of namespace cluster
+} // End of namespace algorithm
+} // End of namespace boost
+
+#endif // BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
--- a/include/boost/algorithm/cluster/k_means.hpp
+++ b/include/boost/algorithm/cluster/k_means.hpp
@ -0,0 +1,195 @@
+/*****
+** References
+** - J. MacQueen, "Some methods for classification and analysis
+**   of multivariate observations", Fifth Berkeley Symposium on
+**   Math Statistics and Probability, 281-297, 1967.
+** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm
+**   on distributed memory multiprocessors",
+**   Large-Scale Parallel Data Mining, 245-260, 1999.
+** Yuanming Chen, 2008-05-08
+*/
+
+#ifndef BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
+#define BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
+
+#include <float.h>
+//#include "common.hpp"
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <list>
+#include <new>
+#include <vector>
+
+namespace boost {
+  namespace algorithm {
+      namespace cluster {
+          namespace detail {  
+                /**
+                 * k-means clustering over a row-pointer matrix (templated port of the original C function).
+                 *
+                 * Initial centroids are the rows data[0], data[n/k], data[2*(n/k)], ...; the
+                 * assign/update pass repeats until the total squared error changes by no more
+                 * than eps between two consecutive passes. A cluster that ends a pass with no
+                 * members gets an all-zero centroid.
+                 *
+                 * @param data      n row pointers, each addressing m attributes
+                 * @param n         number of data points
+                 * @param m         number of attributes per point
+                 * @param k         number of clusters
+                 * @param eps       convergence threshold on the change in total squared error
+                 * @param centroids optional output: k row pointers of m attributes each, which
+                 *                  receive the final centroids; pass null to discard them
+                 * @return array of n cluster labels in [0, k), allocated with calloc(); the
+                 *         caller releases it with free(). If n, m or k is not positive nothing
+                 *         is clustered and a zero-filled array is returned.
+                 * @throws std::bad_alloc if a working buffer or the label array cannot be allocated
+                 */
+                template<typename AttributeType, typename differenceType>
+                int *k_means(AttributeType **data, int n, int m, int k, differenceType eps, AttributeType **centroids)
+                {
+                    /* Degenerate input: there is nothing to seed centroids from (or to write to).
+                       Still hand back a valid zeroed buffer so callers can free() it unconditionally. */
+                    if (n <= 0 || m <= 0 || k <= 0) {
+                        int *no_labels = static_cast<int*>(
+                            std::calloc(n > 0 ? static_cast<std::size_t>(n) : 1, sizeof(int)));
+                        if (!no_labels) {
+                            throw std::bad_alloc();
+                        }
+                        return no_labels;
+                    }
+
+                    const std::size_t point_count = static_cast<std::size_t>(n);
+                    const std::size_t dim_count = static_cast<std::size_t>(m);
+                    const std::size_t cluster_count = static_cast<std::size_t>(k);
+
+                    /* Working buffers are owned by vectors so nothing leaks on any exit path. */
+                    std::vector<int> counts(cluster_count, 0); /* size of each cluster */
+                    std::vector<std::vector<AttributeType> > sums(
+                        cluster_count, std::vector<AttributeType>(dim_count)); /* per-cluster attribute sums */
+
+                    /* Private centroid storage, used only when the caller does not want the centroids. */
+                    std::vector<std::vector<AttributeType> > scratch;
+                    std::vector<AttributeType*> scratch_rows;
+                    if (!centroids) {
+                        scratch.assign(cluster_count, std::vector<AttributeType>(dim_count));
+                        scratch_rows.resize(cluster_count);
+                        for (std::size_t i = 0; i < cluster_count; ++i) {
+                            scratch_rows[i] = &scratch[i][0];
+                        }
+                    }
+                    AttributeType **c = centroids ? centroids : &scratch_rows[0];
+
+                    /* Allocated last: everything after this point only does arithmetic. */
+                    int *labels = static_cast<int*>(std::calloc(point_count, sizeof(int)));
+                    if (!labels) {
+                        throw std::bad_alloc();
+                    }
+
+                    /****
+                    ** initialization: pick k evenly spaced points as initial centroids */
+
+                    const int stride = n / k;
+                    for (int i = 0; i < k; ++i) {
+                        const AttributeType *seed = data[i * stride];
+                        for (int j = 0; j < m; ++j) {
+                            c[i][j] = seed[j];
+                        }
+                    }
+
+                    /****
+                    ** main loop */
+
+                    /* Largest value of the attribute type itself. FLT_MAX is too small a sentinel
+                       for double data and out of range for integer data. */
+                    const AttributeType max_value = std::numeric_limits<AttributeType>::max();
+                    AttributeType error = max_value; /* sum of squared euclidean distances */
+                    AttributeType delta = 0;         /* |error - previous error| */
+
+                    do {
+                        /* save error from last step */
+                        const AttributeType old_error = error;
+                        error = 0;
+
+                        /* clear old counts and attribute sums */
+                        for (int i = 0; i < k; ++i) {
+                            counts[i] = 0;
+                            for (int j = 0; j < m; ++j) {
+                                sums[i][j] = 0;
+                            }
+                        }
+
+                        for (int h = 0; h < n; ++h) {
+                            /* identify the closest cluster */
+                            AttributeType min_distance = max_value;
+                            for (int i = 0; i < k; ++i) {
+                                AttributeType distance = 0;
+                                for (int j = m; j-- > 0; ) {
+                                    const AttributeType diff = data[h][j] - c[i][j];
+                                    distance += diff * diff;
+                                }
+                                if (distance < min_distance) {
+                                    labels[h] = i;
+                                    min_distance = distance;
+                                }
+                            }
+                            /* update size and attribute sums of the destination cluster */
+                            for (int j = 0; j < m; ++j) {
+                                sums[labels[h]][j] += data[h][j];
+                            }
+                            ++counts[labels[h]];
+                            /* update total error */
+                            error += min_distance;
+                        }
+
+                        /* update all centroids; an empty cluster keeps its (all-zero) sums */
+                        for (int i = 0; i < k; ++i) {
+                            for (int j = 0; j < m; ++j) {
+                                c[i][j] = counts[i] ? sums[i][j] / counts[i] : sums[i][j];
+                            }
+                        }
+
+                        delta = error - old_error;
+                        if (delta < 0) {
+                            delta = -delta;
+                        }
+                    } while (delta > eps);
+
+                    return labels;
+                }
+          } //End of detail namespace
+
+            /// A single cluster: its centroid plus the members assigned to it.
+            template<typename PointType>
+            struct KMeansCluster {
+                /// Centre of the cluster.
+                PointType centroid;
+                /// Indices of the points that belong to this cluster.
+                std::vector<int> points;
+            };
+
+            /// The outcome of a clustering run: an ordered collection of clusters.
+            /// ClusterT is the element type stored in the collection.
+            template <typename ClusterT>
+            struct KMeansClustering {
+                /// Container type that holds the clusters.
+                typedef std::vector< ClusterT > type;
+                /// The clusters themselves.
+                type clusters;
+            };
+
+            /**
+            * Clusters the points in [first, last) into k groups with the k-means algorithm.
+            *
+            * The point type (NTupleIter::value_type) must provide a nested value_type, a
+            * static size() giving the number of attributes, and operator[] for attribute
+            * access. The points are copied into a dense matrix and handed to
+            * detail::k_means.
+            *
+            * @param first: iterator to the first data point
+            * @param last: iterator one past the last data point
+            * @param k: the k value for the k-means algorithm (number of clusters)
+            * @param eps: convergence threshold, forwarded unchanged to detail::k_means
+            * @return collection of k clusters, each holding its centroid and the indices
+            *         (position in [first, last)) of its member points; an empty
+            *         collection if k is 0 or the range is empty
+            */
+            template <typename NTupleIter>
+            KMeansClustering< KMeansCluster<typename NTupleIter::value_type> >
+            k_means(NTupleIter first, NTupleIter last, unsigned k,
+                   typename NTupleIter::difference_type const & eps)
+            {
+                // NOTE(review): eps has the iterator's difference_type, which is normally an
+                // integer type - confirm that an integral threshold is really intended.
+                typedef typename NTupleIter::difference_type DistanceType;
+                typedef typename NTupleIter::value_type PointType;
+                typedef typename PointType::value_type AttributeType; //For the C function test, it will be a double type
+                typedef KMeansCluster<PointType> KMeansClusterType;
+
+                KMeansClustering< KMeansClusterType > clustering;
+
+                const DistanceType knumOfPoints = std::distance(first, last); //The n variable in the C function
+                const std::size_t knDimension = PointType::size(); //The m variable in the C function
+
+                //Nothing to cluster: avoid seeding centroids from a point that does not exist.
+                if (k == 0 || knumOfPoints <= 0)
+                {
+                    return clustering;
+                }
+                const std::size_t nPoints = static_cast<std::size_t>(knumOfPoints);
+
+                //Dense row-major copies owned by vectors, so nothing leaks if an exception is thrown.
+                std::vector<AttributeType> dataStorage(nPoints * knDimension);
+                std::vector<AttributeType> centroidStorage(k * knDimension);
+                AttributeType* const pDataBase = dataStorage.empty() ? 0 : &dataStorage[0];
+                AttributeType* const pCentroidBase = centroidStorage.empty() ? 0 : &centroidStorage[0];
+
+                //Row-pointer views in the shape the C function expects.
+                std::vector<AttributeType*> ppData(nPoints);
+                std::vector<AttributeType*> centroids(k);
+                for(std::size_t nCentroid = 0; nCentroid < k; ++nCentroid)
+                {
+                    centroids[nCentroid] = pCentroidBase + nCentroid * knDimension;
+                }
+
+                std::size_t nIndex = 0;
+                for(NTupleIter iter = first; iter != last; ++iter, ++nIndex)
+                {
+                    //Bind with the iterator's own reference type so const iterators work too.
+                    typename std::iterator_traits<NTupleIter>::reference pt = *iter; //A point
+                    ppData[nIndex] = pDataBase + nIndex * knDimension;
+                    for(std::size_t nAttribute = 0; nAttribute < knDimension; ++nAttribute)
+                    {
+                        ppData[nIndex][nAttribute] = pt[nAttribute];
+                    }
+                }
+
+                int* labels = detail::k_means(&ppData[0], static_cast<int>(knumOfPoints),
+                                              static_cast<int>(knDimension), static_cast<int>(k),
+                                              eps, &centroids[0]);
+
+                //labels comes from calloc() inside detail::k_means, so it must be released
+                //with free() (never delete[]), including when building the result throws.
+                try
+                {
+                    clustering.clusters.reserve(k);
+                    for(std::size_t nCentroid = 0; nCentroid < k; ++nCentroid)
+                    {
+                        KMeansClusterType cluster;
+                        for(std::size_t nAttribute = 0; nAttribute < knDimension; ++nAttribute)
+                        {
+                            cluster.centroid[nAttribute] = centroids[nCentroid][nAttribute];
+                        }
+                        clustering.clusters.push_back(cluster);
+                    }
+
+                    for(std::size_t nPoint = 0; nPoint < nPoints; ++nPoint)
+                    {
+                        const int nCentroidIndex = labels[nPoint];
+                        clustering.clusters[nCentroidIndex].points.push_back(static_cast<int>(nPoint));
+                    }
+                }
+                catch(...)
+                {
+                    std::free(labels);
+                    throw;
+                }
+                std::free(labels);
+
+                return clustering;
+            }
+        } //End of cluster namespace
+    } //End of algorithm namespace
+} //End of boost namespace
+
+#endif // BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP