sequence performance tests

[SVN r36086]
2006-11-19 05:15:31 +00:00
parent 5386f5dd3e
commit 4a0304a699
4 changed files with 312 additions and 0 deletions
--- a/example/performance/Jamfile.v2
+++ b/example/performance/Jamfile.v2
@ -13,3 +13,6 @@ exe accumulate : accumulate.cpp ;
 exe inner_product : inner_product.cpp ;

 exe inner_product2 : inner_product2.cpp ;
+
+exe sequence_efficiency : sequence_efficiency.cpp ;
+
--- a/example/performance/measure.hpp
+++ b/example/performance/measure.hpp
@ -0,0 +1,81 @@
+// Copyright David Abrahams, Matthias Troyer, Michael Gauckler
+// 2005. Distributed under the Boost Software License, Version
+// 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/timer.hpp>
+
+namespace test
+{
+  // Sink for benchmark results: a smart compiler's dead code
+  // elimination could otherwise optimize away the work we're testing.
+  // hammer() stores its total here, and we use it to compute the
+  // return code of the executable to make sure it's needed.
+  int live_code;  // NOTE(review): a definition in a header; include from only one translation unit
+
+  // Call each of a local array of Accumulator objects with x, repeats
+  // times over, then add every accumulator's sum member to live_code.
+  template <class Accumulator, class Arg>
+  void hammer(Arg const& x, long const repeats)
+  {
+      // Strategy: because the sum in an accumulator after each call
+      // depends on the previous value of the sum, the CPU's pipeline
+      // might be stalled while waiting for the previous addition to
+      // complete.  Therefore, we allocate an array of accumulators,
+      // and update them in sequence, so that there's no dependency
+      // between adjacent addition operations.
+      //
+      // Additionally, if there were only one accumulator, the
+      // compiler or CPU might decide to update the value in a
+      // register rather than writing it back to memory.  We want each
+      // operation to at least update the L1 cache.  *** Note: This
+      // concern is specific to the particular application at which
+      // we're targeting the test. ***
+
+      // This has to be at least as large as the number of
+      // simultaneous accumulations that can be executing in the
+      // CPU pipeline.  A safe number here is larger than the
+      // machine's maximum pipeline depth.  If you want to test the L2
+      // or L3 cache, or main memory, you can increase the size of
+      // this array.  1024 is an upper limit on the pipeline depth of
+      // current vector machines.
+      const std::size_t number_of_accumulators = 1024;
+      live_code = 0; // discard the total left by any previous call
+
+      Accumulator a[number_of_accumulators];  // each element default-constructed
+      
+      for (long iteration = 0; iteration < repeats; ++iteration)  // one pass over the whole array per repeat
+      {
+          for (Accumulator* ap = a;  ap < a + number_of_accumulators; ++ap)
+          {
+              (*ap)(x);
+          }
+      }
+
+      // Add up all the partial sums so that the work above has an
+      // observable result and survives dead code elimination.
+      for (Accumulator* ap = a;  ap < a + number_of_accumulators; ++ap)
+      {
+          live_code += ap->sum;  // Accumulator must expose a sum member
+      }
+  }
+
+  // Time one hammer<Accumulator>(x, repeats) run with boost::timer and
+  // return the elapsed time divided by repeats.
+  template <class Accumulator, class T>
+  double measure(T const& x, long const repeats)
+  {
+      // Hammer accumulators a couple of times to ensure the
+      // instruction cache is full of our test code, and that we don't
+      // measure the cost of a page fault for accessing the data page
+      // containing the memory where the accumulators will be
+      // allocated.
+      hammer<Accumulator>(x, repeats);
+      hammer<Accumulator>(x, repeats);
+
+      // Now start a timer (only the third run below is timed)
+      boost::timer time;
+      hammer<Accumulator>(x, repeats);  // This time, we'll measure
+      return time.elapsed() / repeats;  // time per repeat, i.e. one pass over every accumulator
+  }
+}
--- a/example/performance/sequence_efficiency.cpp
+++ b/example/performance/sequence_efficiency.cpp
@ -0,0 +1,179 @@
+/*=============================================================================
+    Copyright (c) 2001-2006 Joel de Guzman
+
+    Use, modification and distribution is subject to the Boost Software
+    License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+    http://www.boost.org/LICENSE_1_0.txt)
+==============================================================================*/
+#include "measure.hpp"
+
+#define FUSION_MAX_LIST_SIZE 30
+#define FUSION_MAX_VECTOR_SIZE 30
+
+#include <boost/fusion/algorithm/iteration/accumulate.hpp>
+#include <boost/fusion/sequence/container/vector.hpp>
+#include <boost/fusion/sequence/container/list.hpp>
+
+#include <algorithm>
+#include <numeric>
+#include <functional>
+#include <iostream>
+#include <cmath>
+#include <limits>
+
+#ifdef _MSC_VER
+// inline aggressively
+# pragma inline_recursion(on) // turn on inline recursion
+# pragma inline_depth(255)    // max inline depth
+#endif
+
+namespace
+{
+    struct poly_add  // binary function object: lhs + rhs, returned as Lhs
+    {
+        template<typename Lhs, typename Rhs>
+        struct result  // nested template naming the call's return type
+        {
+            typedef Lhs type;
+        };
+
+        template<typename Lhs, typename Rhs>
+        Lhs operator()(const Lhs& lhs, const Rhs& rhs) const
+        {
+            return lhs + rhs;  // converted to Lhs on return
+        }
+    };
+
+    // Our Accumulator function: each call adds the total of a sequence's elements to sum
+    template <typename T>
+    struct accumulator
+    {
+        accumulator()
+            : sum()  // value-initialize the running total
+        {}
+        
+        template <typename Sequence>
+        void operator()(Sequence const& seq)
+        {
+            this->sum += boost::fusion::accumulate(seq, 0, poly_add());  // fold starts from int 0
+        }
+        
+        T sum;  // read directly by test::hammer
+    };
+}
+
+int main()
+{
+    using namespace test;  // hammer, measure, live_code
+    using namespace boost::fusion;
+
+    vector<
+        int, int, int
+    > 
+    vsmall(BOOST_PP_ENUM_PARAMS(3,));  // initializers expand to 0, 1, 2
+
+    list<
+        int, int, int
+    > 
+    lsmall(BOOST_PP_ENUM_PARAMS(3,));
+
+    vector<
+        int, int, int, int, int, int, int, int, int, int
+    > 
+    vmid(BOOST_PP_ENUM_PARAMS(10,));  // 0 .. 9
+
+    list<
+        int, int, int, int, int, int, int, int, int, int
+    > 
+    lmid(BOOST_PP_ENUM_PARAMS(10,));
+
+    vector<
+        int, int, int, int, int, int, int, int, int, int
+      , int, int, int, int, int, int, int, int, int, int
+      , int, int, int, int, int, int, int, int, int, int
+    > 
+    vbig(BOOST_PP_ENUM_PARAMS(30,));  // 0 .. 29
+
+    list<
+        int, int, int, int, int, int, int, int, int, int
+      , int, int, int, int, int, int, int, int, int, int
+      , int, int, int, int, int, int, int, int, int, int
+    > 
+    lbig(BOOST_PP_ENUM_PARAMS(30,));
+
+    // First decide how many repetitions to measure: grow repeats tenfold until one pass over all six sequences takes 1 second or the cap is passed
+    long repeats = 100;
+    double measured = 0;
+    while (measured < 1.0 && repeats <= 10000000)
+    {
+        repeats *= 10;
+        
+        boost::timer time;
+
+        hammer<accumulator<int> >(vsmall, repeats);
+        hammer<accumulator<int> >(lsmall, repeats);
+        hammer<accumulator<int> >(vmid, repeats);
+        hammer<accumulator<int> >(lmid, repeats);
+        hammer<accumulator<int> >(vbig, repeats);
+        hammer<accumulator<int> >(lbig, repeats);
+
+        measured = time.elapsed();
+    }
+
+    measure<accumulator<int> >(vsmall, 1);  // return value ignored; run with 1 repeat to refresh live_code
+    std::cout 
+        << "small vector accumulated result:    " 
+        << live_code << std::endl;
+    measure<accumulator<int> >(lsmall, 1);
+    std::cout 
+        << "small list accumulated result:      " 
+        << live_code << std::endl;
+    measure<accumulator<int> >(vmid, 1);
+    std::cout 
+        << "medium vector accumulated result:   " 
+        << live_code << std::endl;
+    measure<accumulator<int> >(lmid, 1);
+    std::cout 
+        << "medium list accumulated result:     " 
+        << live_code << std::endl;
+    measure<accumulator<int> >(vbig, 1);
+    std::cout 
+        << "big vector accumulated result:      " 
+        << live_code << std::endl;
+    measure<accumulator<int> >(lbig, 1);
+    std::cout 
+        << "big list accumulated result:        " 
+        << live_code << std::endl;
+
+    std::cout.setf(std::ios::scientific);  // timings below print in scientific notation
+
+    std::cout
+        << "small vector time:                  "
+        << measure<accumulator<int> >(vsmall, repeats)
+        << std::endl;
+    std::cout
+        << "small list time:                    "
+        << measure<accumulator<int> >(lsmall, repeats)
+        << std::endl;    
+    std::cout
+        << "medium vector time:                 "
+        << measure<accumulator<int> >(vmid, repeats)
+        << std::endl;
+    std::cout
+        << "medium list time:                   "
+        << measure<accumulator<int> >(lmid, repeats)
+        << std::endl;
+    std::cout
+        << "big vector time:                    "
+        << measure<accumulator<int> >(vbig, repeats)
+        << std::endl;
+    std::cout
+        << "big list time:                      "
+        << measure<accumulator<int> >(lbig, repeats)
+        << std::endl;
+
+    // This is ultimately responsible for preventing all the test code
+    // from being optimized away.  Change this to return 0 and you
+    // unplug the whole test's life support system.
+    return live_code != 0;  // exit status is 1 when the last total is non-zero
+}
--- a/example/performance/timings.txt
+++ b/example/performance/timings.txt
@ -0,0 +1,49 @@
+Timing results for sequence_efficiency.cpp, comparing the speed of various
+fusion sequences. The test accumulates the elements of a sequence that is
+primed with the values 0..N-1 (N = size of the sequence). Small, medium
+and big sequences are tested, where:
+
+    small = 3 elements
+    medium = 10 elements
+    big = 30 elements
+
+Tester: Joel de Guzman. WinXP, P4-3.0GHZ, 2GB RAM
+
+VC7.1 (flags = /MD /O2 /EHsc /GS)
+
+    small vector time:                  1.880000e-006
+    small list time:                    2.040000e-006
+    medium vector time:                 2.030000e-006
+    medium list time:                   3.590000e-006
+    big vector time:                    1.880000e-006
+    big list time:                      9.070000e-006
+
+VC8.0 (flags = /MD /O2 /EHsc /GS)
+
+    small vector time:                  1.880000e-006
+    small list time:                    2.030000e-006
+    medium vector time:                 2.030000e-006
+    medium list time:                   3.750000e-006
+    big vector time:                    1.880000e-006
+    big list time:                      9.380000e-006
+
+G++ 3.4 (flags = -ftemplate-depth-128  -funroll-loops -O3 -finline-functions -Wno-inline -Wall)
+
+    small vector time:                  2.500000e-05
+    small list time:                    2.500000e-05
+    medium vector time:                 7.970000e-05
+    medium list time:                   7.970000e-05
+    big vector time:                    2.516000e-04
+    big list time:                      2.485000e-04
+
+Intel 9.1 (flags = /MD /O2 /EHsc /GS)
+
+    small vector time:                  1.141000e-006
+    small list time:                    1.156000e-006
+    medium vector time:                 1.156000e-006
+    medium list time:                   1.156000e-006
+    big vector time:                    1.171000e-006
+    big list time:                      1.156000e-006
+
+
+