sequence performance tests

[SVN r36086]
This commit is contained in:
Joel de Guzman
2006-11-19 05:15:31 +00:00
parent 5386f5dd3e
commit 4a0304a699
4 changed files with 312 additions and 0 deletions

View File

@ -13,3 +13,6 @@ exe accumulate : accumulate.cpp ;
exe inner_product : inner_product.cpp ;
exe inner_product2 : inner_product2.cpp ;
exe sequence_efficiency : sequence_efficiency.cpp ;

View File

@ -0,0 +1,81 @@
// Copyright David Abrahams, Matthias Troyer, Michael Gauckler
// 2005. Distributed under the Boost Software License, Version
// 1.0. (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <boost/timer.hpp>
namespace test
{
// This value is required to ensure that a smart compiler's dead
// code elimination doesn't optimize away anything we're testing.
// We'll use it to compute the return code of the executable to make
// sure it's needed.
int live_code;
// Call objects of the given Accumulator type repeatedly with x as
// an argument.
template <class Accumulator, class Arg>
void hammer(Arg const& x, long const repeats)
{
// Strategy: because the sum in an accumulator after each call
// depends on the previous value of the sum, the CPU's pipeline
// might be stalled while waiting for the previous addition to
// complete. Therefore, we allocate an array of accumulators,
// and update them in sequence, so that there's no dependency
// between adjacent addition operations.
//
// Additionally, if there were only one accumulator, the
// compiler or CPU might decide to update the value in a
// register rather that writing it back to memory. we want each
// operation to at least update the L1 cache. *** Note: This
// concern is specific to the particular application at which
// we're targeting the test. ***
// This has to be at least as large as the number of
// simultaneous accumulations that can be executing in the
// compiler pipeline. A safe number here is larger than the
// machine's maximum pipeline depth. If you want to test the L2
// or L3 cache, or main memory, you can increase the size of
// this array. 1024 is an upper limit on the pipeline depth of
// current vector machines.
const std::size_t number_of_accumulators = 1024;
live_code = 0; // reset to zero
Accumulator a[number_of_accumulators];
for (long iteration = 0; iteration < repeats; ++iteration)
{
for (Accumulator* ap = a; ap < a + number_of_accumulators; ++ap)
{
(*ap)(x);
}
}
// Accumulate all the partial sums to avoid dead code
// elimination.
for (Accumulator* ap = a; ap < a + number_of_accumulators; ++ap)
{
live_code += ap->sum;
}
}
// Measure the time required to hammer accumulators of the given
// type with the argument x.
template <class Accumulator, class T>
double measure(T const& x, long const repeats)
{
// Hammer accumulators a couple of times to ensure the
// instruction cache is full of our test code, and that we don't
// measure the cost of a page fault for accessing the data page
// containing the memory where the accumulators will be
// allocated
hammer<Accumulator>(x, repeats);
hammer<Accumulator>(x, repeats);
// Now start a timer
boost::timer time;
hammer<Accumulator>(x, repeats); // This time, we'll measure
return time.elapsed() / repeats; // return the time of one iteration
}
}

View File

@ -0,0 +1,179 @@
/*=============================================================================
Copyright (c) 2001-2006 Joel de Guzman
Use, modification and distribution is subject to the Boost Software
License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
==============================================================================*/
#include "measure.hpp"
#define FUSION_MAX_LIST_SIZE 30
#define FUSION_MAX_VECTOR_SIZE 30
#include <boost/fusion/algorithm/iteration/accumulate.hpp>
#include <boost/fusion/sequence/container/vector.hpp>
#include <boost/fusion/sequence/container/list.hpp>
#include <algorithm>
#include <numeric>
#include <functional>
#include <iostream>
#include <cmath>
#include <limits>
#ifdef _MSC_VER
// inline aggressively
# pragma inline_recursion(on) // turn on inline recursion
# pragma inline_depth(255) // max inline depth
#endif
namespace
{
struct poly_add
{
template<typename Lhs, typename Rhs>
struct result
{
typedef Lhs type;
};
template<typename Lhs, typename Rhs>
Lhs operator()(const Lhs& lhs, const Rhs& rhs) const
{
return lhs + rhs;
}
};
// Our Accumulator function
template <typename T>
struct accumulator
{
accumulator()
: sum()
{}
template <typename Sequence>
void operator()(Sequence const& seq)
{
this->sum += boost::fusion::accumulate(seq, 0, poly_add());
}
T sum;
};
}
int main()
{
using namespace test;
using namespace boost::fusion;
vector<
int, int, int
>
vsmall(BOOST_PP_ENUM_PARAMS(3,));
list<
int, int, int
>
lsmall(BOOST_PP_ENUM_PARAMS(3,));
vector<
int, int, int, int, int, int, int, int, int, int
>
vmid(BOOST_PP_ENUM_PARAMS(10,));
list<
int, int, int, int, int, int, int, int, int, int
>
lmid(BOOST_PP_ENUM_PARAMS(10,));
vector<
int, int, int, int, int, int, int, int, int, int
, int, int, int, int, int, int, int, int, int, int
, int, int, int, int, int, int, int, int, int, int
>
vbig(BOOST_PP_ENUM_PARAMS(30,));
list<
int, int, int, int, int, int, int, int, int, int
, int, int, int, int, int, int, int, int, int, int
, int, int, int, int, int, int, int, int, int, int
>
lbig(BOOST_PP_ENUM_PARAMS(30,));
// first decide how many repetitions to measure
long repeats = 100;
double measured = 0;
while (measured < 1.0 && repeats <= 10000000)
{
repeats *= 10;
boost::timer time;
hammer<accumulator<int> >(vsmall, repeats);
hammer<accumulator<int> >(lsmall, repeats);
hammer<accumulator<int> >(vmid, repeats);
hammer<accumulator<int> >(lmid, repeats);
hammer<accumulator<int> >(vbig, repeats);
hammer<accumulator<int> >(lbig, repeats);
measured = time.elapsed();
}
measure<accumulator<int> >(vsmall, 1);
std::cout
<< "small vector accumulated result: "
<< live_code << std::endl;
measure<accumulator<int> >(lsmall, 1);
std::cout
<< "small list accumulated result: "
<< live_code << std::endl;
measure<accumulator<int> >(vmid, 1);
std::cout
<< "medium vector accumulated result: "
<< live_code << std::endl;
measure<accumulator<int> >(lmid, 1);
std::cout
<< "medium list accumulated result: "
<< live_code << std::endl;
measure<accumulator<int> >(vbig, 1);
std::cout
<< "big vector accumulated result: "
<< live_code << std::endl;
measure<accumulator<int> >(lbig, 1);
std::cout
<< "big list accumulated result: "
<< live_code << std::endl;
std::cout.setf(std::ios::scientific);
std::cout
<< "small vector time: "
<< measure<accumulator<int> >(vsmall, repeats)
<< std::endl;
std::cout
<< "small list time: "
<< measure<accumulator<int> >(lsmall, repeats)
<< std::endl;
std::cout
<< "medium vector time: "
<< measure<accumulator<int> >(vmid, repeats)
<< std::endl;
std::cout
<< "medium list time: "
<< measure<accumulator<int> >(lmid, repeats)
<< std::endl;
std::cout
<< "big vector time: "
<< measure<accumulator<int> >(vbig, repeats)
<< std::endl;
std::cout
<< "big list time: "
<< measure<accumulator<int> >(lbig, repeats)
<< std::endl;
// This is ultimately responsible for preventing all the test code
// from being optimized away. Change this to return 0 and you
// unplug the whole test's life support system.
return live_code != 0;
}

View File

@ -0,0 +1,49 @@
Timing result for sequence_efficiency.cpp comparing the speed of various
fusion sequences. The test involves accumulating the elements of the
sequence which is primed to have values 0..N (N=size of sequence). Small,
medium and big sequences are tested where:
small = 3 elements
medium = 10 elements
big = 30 elements
Tester: Joel de Guzman. WinXP, P4-3.0GHZ, 2GB RAM
VC7.1 (flags = /MD /O2 /EHsc /GS)
small vector time: 1.880000e-006
small list time: 2.040000e-006
medium vector time: 2.030000e-006
medium list time: 3.590000e-006
big vector time: 1.880000e-006
big list time: 9.070000e-006
VC8.0 (flags = /MD /O2 /EHsc /GS)
small vector time: 1.880000e-006
small list time: 2.030000e-006
medium vector time: 2.030000e-006
medium list time: 3.750000e-006
big vector time: 1.880000e-006
big list time: 9.380000e-006
G++ 3.4 (flags = -ftemplate-depth-128 -funroll-loops -O3 -finline-functions -Wno-inline -Wall)
small vector time: 2.500000e-05
small list time: 2.500000e-05
medium vector time: 7.970000e-05
medium list time: 7.970000e-05
big vector time: 2.516000e-04
big list time: 2.485000e-04
Intel 9.1 (flags = /MD /O2 /EHsc /GS)
small vector time: 1.141000e-006
small list time: 1.156000e-006
medium vector time: 1.156000e-006
medium list time: 1.156000e-006
big vector time: 1.171000e-006
big list time: 1.156000e-006