diff --git a/experimental/bench_segmented_algos.cpp b/experimental/bench_segmented_algos.cpp index 8760f05..cdc6137 100644 --- a/experimental/bench_segmented_algos.cpp +++ b/experimental/bench_segmented_algos.cpp @@ -8,6 +8,19 @@ // ////////////////////////////////////////////////////////////////////////////// +// Force aggressive function/loop alignment to eliminate instruction cache-line +// alignment noise from benchmark measurements. Without this, identical code can +// show up to 1.8x performance variation depending on where the linker happens +// to place each template instantiation relative to 64-byte cache-line boundaries. Keep this block above the #includes: the GCC pragma only affects functions defined after it, so it has to come before any header whose code should be aligned. +#if defined(__GNUC__) && !defined(__clang__) + #pragma GCC optimize("align-functions=64", "align-loops=32") +#elif defined(__clang__) + // Clang has no file-wide pragma for alignment. Use command-line flags: + // -falign-functions=64 -falign-loops=32 +#elif defined(_MSC_VER) + // MSVC has no pragma or attribute for function/loop alignment control. +#endif + #include #include #include @@ -2492,7 +2505,7 @@ void run_benchmarks() { std::cout << "--- bc::deque<" << typeid(T).name() << "> ---\n"; - bc::deque > > dq; + bc::deque > > dq; fill_test_data(dq, N); run_all(dq, iter, "deque"); std::cout << "\n";