Moving cluster dir to project-specific location in repo.

[SVN r45238]
This commit is contained in:
Jonathan Franklin
2008-05-09 01:43:15 +00:00
parent fe73f86604
commit 7850e71c9e
5 changed files with 0 additions and 505 deletions

View File

@ -1,69 +0,0 @@
// (C) Copyright Jonathan Franklin 2008.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#if ! defined BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP
#define BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP
#include <boost/shared_ptr.hpp>
#include <vector>
namespace boost
{
namespace algorithm
{
namespace cluster
{
/*! A sequence of clusters with shared-ownership semantics.
 *
 * The clusters are stored in a std::vector that is held through a
 * boost::shared_ptr. Copy construction and assignment copy the pointer, not
 * the vector, so every copy of a cluster_data refers to (and mutates) the
 * same underlying collection.
 *
 * \tparam ClusterT The type used to represent a single cluster.
 */
template<typename ClusterT>
struct cluster_data
{
    typedef ClusterT value_type;
    typedef std::vector<value_type> clusters;

    typedef typename clusters::iterator iterator;
    typedef typename clusters::const_iterator const_iterator;
    typedef typename clusters::reverse_iterator reverse_iterator;

    /*! Creates an empty collection of clusters. */
    cluster_data() : m_pClusters(new clusters) {}

    ~cluster_data() {}

    /*! Shallow copy: the new object shares the clusters of \a c. */
    cluster_data(cluster_data const & c) : m_pClusters(c.m_pClusters) {}

    /*! Shallow assignment: afterwards this object shares the clusters of
     * \a rhs.
     * \return A reference to this object.
     */
    cluster_data const & operator=(cluster_data const & rhs)
    {
        m_pClusters = rhs.m_pClusters;
        return *this;
    }

    iterator begin() { return m_pClusters->begin(); }
    iterator end() { return m_pClusters->end(); }

    const_iterator begin() const { return m_pClusters->begin(); }
    const_iterator end() const { return m_pClusters->end(); }

    // Reverse traversal must hand back reverse iterators; the underlying
    // vector's rbegin()/rend() do not convert to a forward iterator.
    reverse_iterator rbegin() { return m_pClusters->rbegin(); }
    reverse_iterator rend() { return m_pClusters->rend(); }

    /*! Inserts \a val before \a loc and returns an iterator to the new
     * element.
     */
    iterator insert(iterator loc, value_type const & val)
    {
        return m_pClusters->insert(loc, val);
    }

    void push_back(value_type const & v) { m_pClusters->push_back(v); }
    void pop_back() { m_pClusters->pop_back(); }

    value_type & back() { return m_pClusters->back(); }
    value_type const & back() const { return m_pClusters->back(); }

    /*! \return The number of clusters currently stored. */
    size_t size() const { return m_pClusters->size(); }

private:
    // Shared by all copies of this object.
    boost::shared_ptr<clusters> m_pClusters;
};
} // End of namespace cluster
// TODO: Should we be exporting this?
using namespace cluster;
} // End of namespace algorithm
} // End of namespace boost
#endif // BOOST_ALGORITHM_CLUSTER_CLUSTER_DATA_HPP

View File

@ -1,38 +0,0 @@
// (C) Copyright Jonathan Franklin 2008.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#if ! defined BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP
#define BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP
#include <boost/concept_check.hpp>
namespace boost
{
namespace algorithm
{
namespace cluster
{
/*! Concept-check class: requires that an object of type DistanceFunT can be
 * invoked with two arguments of type T.
 *
 * The class is meant to be named in a concept check (for example through
 * boost::function_requires), which forces constraints() to be compiled; the
 * members only exist to provide operands for the checked expression.
 */
template<typename T, typename DistanceFunT>
struct DistanceComparableConcept
{
    void constraints()
    {
        // Required valid expression: the distance function applied to a
        // pair of T values.
        m_distance(m_value, m_value);
    }

private:
    T m_value;
    DistanceFunT m_distance;
};
// TODO: Add concepts here, then delete this comment.
} // End of namespace cluster;
} // End of namespace algorithm;
} // End of namespace boost;
#endif // BOOST_ALGORITHM_CLUSTER_CONCEPT_HPP

View File

@ -1,153 +0,0 @@
// (C) Copyright Jonathan Franklin 2008.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#if ! defined BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
#define BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP
#include <boost/algorithm/cluster/cluster_data.hpp>
#include <boost/algorithm/cluster/concept.hpp>
#include <boost/algorithm/cluster/detail/naive_query.hpp>
#include <boost/utility/result_of.hpp>
#include <vector>
namespace boost
{
namespace algorithm
{
namespace cluster
{
namespace detail
{

// TODO: Where should we put these?
// Cluster ids stored in node::cluster before a real cluster number (>= 1)
// has been assigned.
const int UNCLASSIFIED = -1; // Tuple has not been visited yet.
const int NOISE = 0;         // Tuple was visited and marked as noise.

// TODO: Replace this so we don't have to store the cluster info for each tuple?
/*! Book-keeping record pairing an iterator to a tuple with the id of the
 * cluster that tuple currently belongs to.
 */
template<typename NTupleIterT>
struct node
{
    /*! Wraps \a position; the node starts out UNCLASSIFIED. */
    node(NTupleIterT const & position)
        : tuple(position),
          cluster(UNCLASSIFIED)
    {
    }

    NTupleIterT tuple; // Iterator referring to the wrapped tuple.
    int cluster;       // UNCLASSIFIED, NOISE, or a cluster number.
};

} // End of namespace detail.
/*! DBSCAN density-based clustering algorithm.
 *
 * Visits every tuple in [begin, end). A tuple whose eps-neighbourhood (as
 * returned by detail::naive_query) holds fewer than \a min_points tuples is
 * marked as noise; otherwise it starts a new cluster, which is then grown by
 * repeatedly querying the neighbourhoods of its members. Tuples that end up
 * as noise are not part of any returned cluster.
 *
 * \tparam ClusterT    Type of a single cluster. It must be default
 *                     constructible and provide push_back(NTupleIterT).
 *                     It appears only in the return type, so the caller has
 *                     to specify it explicitly.
 * \tparam NTupleIterT Iterator over the tuples to cluster.
 * \tparam DistanceT   Type of the neighbourhood radius.
 * \tparam DistFunT    Distance function object type.
 *
 * \param[in] begin      Iterator to the first tuple.
 * \param[in] end        Iterator one past the last tuple.
 * \param[in] eps        Neighbourhood radius passed to the query.
 * \param[in] min_points Minimum neighbourhood size for a tuple to start or
 *                       extend a cluster.
 * \param[in] d          Distance function, invoked on pairs of tuples.
 * \return The cluster data (partitioning of the tuples); each cluster holds
 *         iterators into [begin, end).
 */
template<typename ClusterT, typename NTupleIterT,
         typename DistanceT, typename DistFunT>
cluster_data<ClusterT>
dbscan(NTupleIterT const & begin,
       NTupleIterT const & end,
       DistanceT const & eps,
       size_t min_points,
       DistFunT const & d)
{
    // Concept check.
    function_requires<
        DistanceComparableConcept<typename NTupleIterT::value_type, DistFunT> >();
    // NOTE(review): this second check requires d(eps, eps) to compile, which
    // looks unintended (d is only ever applied to tuples) -- confirm.
    function_requires<
        DistanceComparableConcept<DistanceT, DistFunT> >();

    // TODO: Rework the algorithm to NOT make this extra collection?
    typedef detail::node<NTupleIterT> node;
    typedef std::vector<node> ntuple_nodes;
    // ntuple_nodes depends on NTupleIterT, so its nested iterator type has to
    // be introduced with 'typename'.
    typedef typename ntuple_nodes::iterator node_iterator;
    typedef std::vector<node_iterator> node_iterators;

    // Initialize algorithm: wrap every tuple in an UNCLASSIFIED node.
    ntuple_nodes tuples;
    for (NTupleIterT it = begin; it != end; ++it)
    {
        tuples.push_back(node(it));
    }

    // The result uses the caller's cluster type so that it matches both the
    // ClusterT objects pushed below and the declared return type.
    cluster_data<ClusterT> p;

    // TODO: We should try to make cluster_num go away.
    int cluster_num = 0;
    for (node_iterator it = tuples.begin(); it != tuples.end(); ++it)
    {
        // Skip this tuple if it's already been classified as a cluster or
        // noise.
        if (it->cluster != detail::UNCLASSIFIED)
            continue;

        // Expand cluster.
        node_iterators seeds;
        detail::naive_query(it, tuples.begin(), tuples.end(), eps, d, seeds);

        // If the neighborhood of this tuple is too small, then mark it as
        // noise.
        if (seeds.size() < min_points)
        {
            it->cluster = detail::NOISE;
            continue;
        }

        // Start the next cluster.
        ++cluster_num;
        p.push_back(ClusterT()); // TODO: This is goofy.
        ClusterT & cur_cluster = p.back();

        // Mark the neighborhood as part of the current cluster. Neighbors
        // that already belong to an earlier cluster keep that membership
        // (same rule as the expansion loop below); re-adding them would put
        // one tuple into two clusters.
        it->cluster = cluster_num;
        cur_cluster.push_back(it->tuple);
        for (size_t n = 0; n < seeds.size(); ++n)
        {
            if (seeds[n]->cluster < 1) // Not assigned to cluster yet.
            {
                seeds[n]->cluster = cluster_num;
                cur_cluster.push_back(seeds[n]->tuple);
            }
        }

        // Keep adding seeds and processing them until we find all points
        // that are Density Reachable.
        while (! seeds.empty())
        {
            node_iterator cur = seeds.back();
            seeds.pop_back();

            node_iterators results;
            detail::naive_query(cur, tuples.begin(), tuples.end(), eps, d, results);

            if (results.size() >= min_points)
            {
                for (size_t n = 0; n < results.size(); ++n)
                {
                    if (results[n]->cluster < 1) // Not assigned to cluster yet.
                    {
                        // Only never-visited tuples need their own
                        // neighborhood examined; former noise is simply
                        // absorbed into the cluster.
                        if (detail::UNCLASSIFIED == results[n]->cluster)
                            seeds.push_back(results[n]);
                        results[n]->cluster = cluster_num;
                        cur_cluster.push_back(results[n]->tuple);
                    }
                }
            }
        }
    } // Outer loop for all tuples.

    return p;
}
} // End of namespace cluster
// TODO: Should we be exporting this?
using namespace cluster;
} // End of namespace algorithm
} // End of namespace boost
#endif // BOOST_ALGORITHM_CLUSTER_DBSCAN_HPP

View File

@ -1,50 +0,0 @@
// (C) Copyright Jonathan Franklin 2008.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#if ! defined BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
#define BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
#include <boost/algorithm/cluster/cluster_data.hpp>
#include <boost/algorithm/cluster/concept.hpp>
#include <vector>
namespace boost
{
namespace algorithm
{
namespace cluster
{
namespace detail
{
// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
// Every call scans the whole range, which makes the overall runtime
// quadratic.
/*! Linear-scan neighbourhood query.
 *
 * Appends to \a v every iterator in [begin, end) that is not \a query_pt
 * itself and whose tuple is not farther than \a eps from the query tuple
 * according to \a d. Existing contents of \a v are left untouched.
 *
 * \param[in]     query_pt Iterator to the node whose neighbours are wanted.
 * \param[in]     begin    Iterator to the first candidate node.
 * \param[in]     end      Iterator one past the last candidate node.
 * \param[in]     eps      Neighbourhood radius.
 * \param[in]     d        Distance function applied to the dereferenced
 *                         'tuple' members of two nodes.
 * \param[in,out] v        Receives the iterators of the neighbours found.
 */
template<typename NTupleIterT, typename DistanceT, typename DistFunT>
static void naive_query(
    NTupleIterT const & query_pt,
    NTupleIterT const & begin,
    NTupleIterT const & end,
    DistanceT const & eps,
    DistFunT const & d,
    std::vector<NTupleIterT> & v)
{
    NTupleIterT candidate = begin;
    while (candidate != end)
    {
        if (query_pt == candidate)
        {
            // A point is never reported as its own neighbour.
        }
        else if (d(*query_pt->tuple, *candidate->tuple) > eps)
        {
            // Outside the eps-neighbourhood.
        }
        else
        {
            v.push_back(candidate);
        }
        ++candidate;
    }
}
} // End of namespace detail.
} // End of namespace cluster
} // End of namespace algorithm
} // End of namespace boost
#endif // BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP

View File

@ -1,195 +0,0 @@
/*****
** References
** - J. MacQueen, "Some methods for classification and analysis
** of multivariate observations", Fifth Berkeley Symposium on
** Math Statistics and Probability, 281-297, 1967.
** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm
** on distributed memory multiprocessors",
** Large-Scale Parallel Data Mining, 245-260, 1999.
** Yuanming Chen, 2008-05-08
*/
#ifndef BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
#define BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
#include <float.h>
#include <cassert>
#include <cmath>
#include <cstdlib>
//#include "common.hpp"
#include <list>
#include <vector>
namespace boost {
namespace algorithm {
namespace cluster {
namespace detail {
/**
 * K-means over a dense, row-major data set (the original C function).
 *
 * The k initial centroids are taken from the data rows 0, n/k, 2*(n/k), ...
 * Each iteration assigns every point to the centroid with the smallest
 * squared Euclidean distance and then moves every non-empty cluster's
 * centroid to the mean of its points. Iteration stops once the summed
 * squared distance changes by no more than eps between two passes.
 *
 * @param data      n rows of m attributes each
 * @param n         number of data points
 * @param m         number of attributes per point
 * @param k         number of clusters
 * @param eps       convergence threshold on the change of the total error
 * @param centroids optional output: k rows of m attributes that receive the
 *                  final centroids; may be null
 * @return an array of n cluster labels allocated with calloc(); the caller
 *         must release it with free(). When data is null, or n or k is not
 *         positive, nothing is clustered: the labels (if any) are all zero
 *         and centroids is left untouched.
 */
template<typename AttributeType, typename differenceType>
int *k_means(AttributeType **data, int n, int m, int k, differenceType eps, AttributeType **centroids)
{
    /* output cluster label for each data point */
    int *labels = static_cast<int*>(std::calloc(n > 0 ? n : 0, sizeof(int)));

    /* Without points or clusters the loops below would read data[0] or
       write through c1[0], neither of which exists. */
    if (!data || n <= 0 || k <= 0) {
        return labels;
    }

    int *counts = static_cast<int*>(std::calloc(k, sizeof(int))); /* size of each cluster */
    AttributeType old_error, error = FLT_MAX; /* sum of squared euclidean distance */
    AttributeType **c = centroids
        ? centroids
        : static_cast<AttributeType**>(std::calloc(k, sizeof(AttributeType*)));
    AttributeType **c1 =
        static_cast<AttributeType**>(std::calloc(k, sizeof(AttributeType*))); /* temp centroids */

    /****
    ** initialization */
    for (int h = 0, i = 0; i < k; h += n / k, ++i) {
        c1[i] = static_cast<AttributeType*>(std::calloc(m, sizeof(AttributeType)));
        if (!centroids) {
            c[i] = static_cast<AttributeType*>(std::calloc(m, sizeof(AttributeType)));
        }
        /* pick k points as initial centroids */
        for (int j = m; j-- > 0; ) {
            c[i][j] = data[h][j];
        }
    }

    /****
    ** main loop */
    do {
        /* save error from last step */
        old_error = error;
        error = 0;

        /* clear old counts and temp centroids */
        for (int i = 0; i < k; ++i) {
            for (int j = 0; j < m; ++j) {
                c1[i][j] = 0;
            }
            counts[i] = 0;
        }

        for (int h = 0; h < n; ++h) {
            /* identify the closest cluster */
            AttributeType min_distance = FLT_MAX;
            for (int i = 0; i < k; ++i) {
                AttributeType distance = 0;
                for (int j = m; j-- > 0; ) {
                    /* plain multiply instead of the general pow() */
                    const AttributeType diff = data[h][j] - c[i][j];
                    distance += diff * diff;
                }
                if (distance < min_distance) {
                    labels[h] = i;
                    min_distance = distance;
                }
            }
            /* update size and temp centroid of the destination cluster */
            for (int j = m; j-- > 0; ) {
                c1[labels[h]][j] += data[h][j];
            }
            counts[labels[h]]++;
            /* update standard error */
            error += min_distance;
        }

        for (int i = 0; i < k; ++i) { /* update all centroids */
            for (int j = 0; j < m; ++j) {
                /* an empty cluster has an all-zero sum, which is used as is */
                c[i][j] = counts[i] ? c1[i][j] / counts[i] : c1[i][j];
            }
        }
    } while (std::fabs(error - old_error) > eps);

    /****
    ** housekeeping */
    for (int i = 0; i < k; ++i) {
        if (!centroids) {
            std::free(c[i]);
        }
        std::free(c1[i]);
    }
    if (!centroids) {
        std::free(c);
    }
    std::free(c1);
    std::free(counts);

    return labels;
}
} // End of detail namespace
/**
 * One cluster produced by k-means: its centroid together with the indices
 * of the data points assigned to it.
 */
template<typename PointType>
struct KMeansCluster
{
    PointType centroid;

    // Indices of the member points (not the points themselves).
    std::vector<int> points;
};
/**
 * The result of a k-means run: a collection of clusters.
 */
template <typename ClusterType>
struct KMeansClustering
{
    // Container type used to hold the clusters.
    typedef std::vector< ClusterType > type;

    type clusters;
};
/**
* @param first: the first data point's iterator
* @param last: the last data point's iterator
* @param k: the k value for the k-mean algorithm
* @return collections of clusters
*/
template <typename NTupleIter>
typename KMeansClustering< typename KMeansCluster<typename NTupleIter::value_type> >
k_means(NTupleIter first, NTupleIter last, unsigned k,
typename NTupleIter::difference_type const & eps)
{
typedef NTupleIter::difference_type DistanceType;
typedef NTupleIter::value_type PointType;
typedef PointType::value_type AttributeType; //For the c funtion test, it will be a double type
const DistanceType knumOfPoints = last - first; //The n variable in the C function
const size_t knDimension = PointType::size(); //The m variable in the C function
AttributeType** ppData = new AttributeType* [knumOfPoints];
AttributeType** centroids = new AttributeType* [k];
//Pre-allocate the result array
for(size_t nCentroid = 0; nCentroid < k; nCentroid++)
{
centroids[nCentroid] = new AttributeType[knDimension];
}
int nIndex = 0;
for(NTupleIter iter = first; iter != last; iter++, nIndex++)
{
PointType& pt= *iter; //A point
ppData[nIndex] = new AttributeType[knDimension];
for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++)
{
ppData[nIndex][nAttribute] = pt[nAttribute];
}
}
int* labels = detail::k_means(ppData, (int) knumOfPoints, (int) knDimension, k, eps, centroids);
typedef KMeansCluster<PointType> KMeansClusterType;
KMeansClustering< KMeansClusterType > clustering;
for(size_t nCentroid = 0; nCentroid < k; nCentroid++)
{
KMeansClusterType cluster;
PointType centroid;
for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++)
{
centroid[nAttribute] = centroids[nCentroid][nAttribute];
}
cluster.centroid = centroid;
clustering.clusters.push_back(cluster);
delete[] centroids[nCentroid];
}
for(int nPoint = 0; nPoint < knumOfPoints; nPoint++)
{
int nCentroidIndex = labels[nPoint];
clustering.clusters[nCentroidIndex].points.push_back(nPoint);
delete[] ppData[nPoint];
}
delete[] centroids;
delete[] ppData;
delete[] labels;
return clustering;
}
} //End of cluster namespace
} //End of algorithm namespace
} //End of boost namespace
#endif // BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP