diff --git a/example/performance/Jamfile.v2 b/example/performance/Jamfile.v2 index ec8e7b29..53d5ec5f 100644 --- a/example/performance/Jamfile.v2 +++ b/example/performance/Jamfile.v2 @@ -13,3 +13,6 @@ exe accumulate : accumulate.cpp ; exe inner_product : inner_product.cpp ; exe inner_product2 : inner_product2.cpp ; + +exe sequence_efficiency : sequence_efficiency.cpp ; + diff --git a/example/performance/measure.hpp b/example/performance/measure.hpp new file mode 100644 index 00000000..9749d1e2 --- /dev/null +++ b/example/performance/measure.hpp @@ -0,0 +1,81 @@ +// Copyright David Abrahams, Matthias Troyer, Michael Gauckler +// 2005. Distributed under the Boost Software License, Version +// 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include + +namespace test +{ + // This value is required to ensure that a smart compiler's dead + // code elimination doesn't optimize away anything we're testing. + // We'll use it to compute the return code of the executable to make + // sure it's needed. + int live_code; + + // Call objects of the given Accumulator type repeatedly with x as + // an argument. + template + void hammer(Arg const& x, long const repeats) + { + // Strategy: because the sum in an accumulator after each call + // depends on the previous value of the sum, the CPU's pipeline + // might be stalled while waiting for the previous addition to + // complete. Therefore, we allocate an array of accumulators, + // and update them in sequence, so that there's no dependency + // between adjacent addition operations. + // + // Additionally, if there were only one accumulator, the + // compiler or CPU might decide to update the value in a + // register rather than writing it back to memory. We want each + // operation to at least update the L1 cache. *** Note: This + // concern is specific to the particular application at which + // we're targeting the test. 
*** + + // This has to be at least as large as the number of + // simultaneous accumulations that can be executing in the + // CPU pipeline. A safe number here is larger than the + // machine's maximum pipeline depth. If you want to test the L2 + // or L3 cache, or main memory, you can increase the size of + // this array. 1024 is an upper limit on the pipeline depth of + // current vector machines. + const std::size_t number_of_accumulators = 1024; + live_code = 0; // reset to zero + + Accumulator a[number_of_accumulators]; + + for (long iteration = 0; iteration < repeats; ++iteration) + { + for (Accumulator* ap = a; ap < a + number_of_accumulators; ++ap) + { + (*ap)(x); + } + } + + // Accumulate all the partial sums to avoid dead code + // elimination. + for (Accumulator* ap = a; ap < a + number_of_accumulators; ++ap) + { + live_code += ap->sum; + } + } + + // Measure the time required to hammer accumulators of the given + // type with the argument x. + template + double measure(T const& x, long const repeats) + { + // Hammer accumulators a couple of times to ensure the + // instruction cache is full of our test code, and that we don't + // measure the cost of a page fault for accessing the data page + // containing the memory where the accumulators will be + // allocated. + hammer(x, repeats); + hammer(x, repeats); + + // Now start a timer + boost::timer time; + hammer(x, repeats); // This time, we'll measure + return time.elapsed() / repeats; // elapsed time divided by repeats, i.e. the time of one iteration + } +} diff --git a/example/performance/sequence_efficiency.cpp b/example/performance/sequence_efficiency.cpp new file mode 100644 index 00000000..e9515838 --- /dev/null +++ b/example/performance/sequence_efficiency.cpp @@ -0,0 +1,179 @@ +/*============================================================================= + Copyright (c) 2001-2006 Joel de Guzman + + Use, modification and distribution is subject to the Boost Software + License, Version 1.0. 
(See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt) +==============================================================================*/ +#include "measure.hpp" + +#define FUSION_MAX_LIST_SIZE 30 +#define FUSION_MAX_VECTOR_SIZE 30 + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +// inline aggressively +# pragma inline_recursion(on) // turn on inline recursion +# pragma inline_depth(255) // max inline depth +#endif + +namespace +{ + struct poly_add + { + template + struct result + { + typedef Lhs type; + }; + + template + Lhs operator()(const Lhs& lhs, const Rhs& rhs) const + { + return lhs + rhs; + } + }; + + // Our Accumulator function object: each call adds the fusion accumulate() of the given sequence (folded with poly_add, starting from 0) to sum + template + struct accumulator + { + accumulator() + : sum() + {} + + template + void operator()(Sequence const& seq) + { + this->sum += boost::fusion::accumulate(seq, 0, poly_add()); + } + + T sum; + }; +} + +int main() +{ + using namespace test; + using namespace boost::fusion; + + vector< + int, int, int + > + vsmall(BOOST_PP_ENUM_PARAMS(3,)); + + list< + int, int, int + > + lsmall(BOOST_PP_ENUM_PARAMS(3,)); + + vector< + int, int, int, int, int, int, int, int, int, int + > + vmid(BOOST_PP_ENUM_PARAMS(10,)); + + list< + int, int, int, int, int, int, int, int, int, int + > + lmid(BOOST_PP_ENUM_PARAMS(10,)); + + vector< + int, int, int, int, int, int, int, int, int, int + , int, int, int, int, int, int, int, int, int, int + , int, int, int, int, int, int, int, int, int, int + > + vbig(BOOST_PP_ENUM_PARAMS(30,)); + + list< + int, int, int, int, int, int, int, int, int, int + , int, int, int, int, int, int, int, int, int, int + , int, int, int, int, int, int, int, int, int, int + > + lbig(BOOST_PP_ENUM_PARAMS(30,)); + + // First decide how many repetitions to measure: grow repeats tenfold until one pass over all six sequences reports an elapsed time of at least 1.0, or repeats exceeds 10000000 + long repeats = 100; + double measured = 0; + while (measured < 1.0 && repeats <= 10000000) + { + repeats *= 10; + + boost::timer time; + + hammer >(vsmall, repeats); + 
hammer >(lsmall, repeats); + hammer >(vmid, repeats); + hammer >(lmid, repeats); + hammer >(vbig, repeats); + hammer >(lbig, repeats); + + measured = time.elapsed(); + } + + measure >(vsmall, 1); + std::cout + << "small vector accumulated result: " + << live_code << std::endl; + measure >(lsmall, 1); + std::cout + << "small list accumulated result: " + << live_code << std::endl; + measure >(vmid, 1); + std::cout + << "medium vector accumulated result: " + << live_code << std::endl; + measure >(lmid, 1); + std::cout + << "medium list accumulated result: " + << live_code << std::endl; + measure >(vbig, 1); + std::cout + << "big vector accumulated result: " + << live_code << std::endl; + measure >(lbig, 1); + std::cout + << "big list accumulated result: " + << live_code << std::endl; + + std::cout.setf(std::ios::scientific); + + std::cout + << "small vector time: " + << measure >(vsmall, repeats) + << std::endl; + std::cout + << "small list time: " + << measure >(lsmall, repeats) + << std::endl; + std::cout + << "medium vector time: " + << measure >(vmid, repeats) + << std::endl; + std::cout + << "medium list time: " + << measure >(lmid, repeats) + << std::endl; + std::cout + << "big vector time: " + << measure >(vbig, repeats) + << std::endl; + std::cout + << "big list time: " + << measure >(lbig, repeats) + << std::endl; + + // This is ultimately responsible for preventing all the test code + // from being optimized away. Change this to return 0 and you + // unplug the whole test's life support system. + return live_code != 0; +} diff --git a/example/performance/timings.txt b/example/performance/timings.txt new file mode 100644 index 00000000..af383d87 --- /dev/null +++ b/example/performance/timings.txt @@ -0,0 +1,49 @@ +Timing result for sequence_efficiency.cpp comparing the speed of various +fusion sequences. The test involves accumulating the elements of the +sequence which is primed to have values 0..N-1 (N=size of sequence). 
Small, +medium and big sequences are tested where: + + small = 3 elements + medium = 10 elements + big = 30 elements + +Tester: Joel de Guzman. WinXP, P4-3.0GHZ, 2GB RAM + +VC7.1 (flags = /MD /O2 /EHsc /GS) + + small vector time: 1.880000e-006 + small list time: 2.040000e-006 + medium vector time: 2.030000e-006 + medium list time: 3.590000e-006 + big vector time: 1.880000e-006 + big list time: 9.070000e-006 + +VC8.0 (flags = /MD /O2 /EHsc /GS) + + small vector time: 1.880000e-006 + small list time: 2.030000e-006 + medium vector time: 2.030000e-006 + medium list time: 3.750000e-006 + big vector time: 1.880000e-006 + big list time: 9.380000e-006 + +G++ 3.4 (flags = -ftemplate-depth-128 -funroll-loops -O3 -finline-functions -Wno-inline -Wall) + + small vector time: 2.500000e-05 + small list time: 2.500000e-05 + medium vector time: 7.970000e-05 + medium list time: 7.970000e-05 + big vector time: 2.516000e-04 + big list time: 2.485000e-04 + +Intel 9.1 (flags = /MD /O2 /EHsc /GS) + + small vector time: 1.141000e-006 + small list time: 1.156000e-006 + medium vector time: 1.156000e-006 + medium list time: 1.156000e-006 + big vector time: 1.171000e-006 + big list time: 1.156000e-006 + + +