From d6905ab15992007c28589444a685b4a2878eb977 Mon Sep 17 00:00:00 2001 From: Peter Dimov Date: Fri, 25 Nov 2022 20:52:56 +0200 Subject: [PATCH] Add benchmark/word_count.cpp --- benchmark/word_count.cpp | 397 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 benchmark/word_count.cpp diff --git a/benchmark/word_count.cpp b/benchmark/word_count.cpp new file mode 100644 index 0000000..2f09f04 --- /dev/null +++ b/benchmark/word_count.cpp @@ -0,0 +1,397 @@ +// Copyright 2022 Peter Dimov. +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#define _SILENCE_CXX17_OLD_ALLOCATOR_MEMBERS_DEPRECATION_WARNING +#define _SILENCE_CXX20_CISO646_REMOVED_WARNING + +#include +#ifdef HAVE_ABSEIL +# include "absl/hash/hash.h" +#endif +#ifdef HAVE_MULXP_HASH +# include "mulxp_hash.hpp" +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std::chrono_literals; + +static void print_time( std::chrono::steady_clock::time_point & t1, char const* label, std::uint32_t s, std::size_t size ) +{ + auto t2 = std::chrono::steady_clock::now(); + + std::cout << label << ": " << ( t2 - t1 ) / 1ms << " ms (s=" << s << ", size=" << size << ")\n"; + + t1 = t2; +} + +static std::vector words; + +static void init_words() +{ + char const* fn = "enwik9"; // http://mattmahoney.net/dc/textdata + + auto t1 = std::chrono::steady_clock::now(); + + std::ifstream is( fn ); + std::string in( std::istreambuf_iterator( is ), std::istreambuf_iterator{} ); + + boost::regex re( "[a-zA-Z]+"); + boost::sregex_token_iterator it( in.begin(), in.end(), re, 0 ), end; + + words.assign( it, end ); + + auto t2 = std::chrono::steady_clock::now(); + + std::cout << fn << ": " << words.size() << " words, " << ( t2 - t1 ) / 1ms << " ms\n\n"; +} + +template BOOST_NOINLINE void test_word_count( Map& map, std::chrono::steady_clock::time_point & t1 ) +{ + std::size_t s = 0; + + for( auto const& word: words ) + { + ++map[ word ]; + ++s; + } + + print_time( t1, "Word count", s, map.size() ); + + std::cout << std::endl; +} + +template BOOST_NOINLINE void test_contains( Map& map, std::chrono::steady_clock::time_point & t1 ) +{ + std::size_t s = 0; + + for( auto const& word: words ) + { + std::string_view w2( word ); + w2.remove_prefix( 1 ); + + s += map.contains( w2 ); + } + + print_time( t1, "Contains", s, map.size() ); + + std::cout << std::endl; +} + +template BOOST_NOINLINE void test_count( Map& map, std::chrono::steady_clock::time_point & t1 ) +{ + std::size_t s = 0; + + for( auto const& word: words ) + { + std::string_view w2( word ); + w2.remove_prefix( 1 ); + + s += map.count( w2 ); + } + + print_time( t1, "Count", s, map.size() ); + + std::cout << std::endl; +} + +// + +struct record +{ + std::string label_; + long long time_; +}; + +static std::vector times; + +template BOOST_NOINLINE void test( char const* label ) +{ + std::cout << label << ":\n\n"; + + boost::unordered_flat_map map; + + auto t0 = std::chrono::steady_clock::now(); + auto t1 = t0; + + test_word_count( map, t1 ); + + record rec = { label, 0 }; + + test_contains( map, t1 ); + test_count( map, t1 ); + + auto tN = std::chrono::steady_clock::now(); + std::cout << "Total: " << ( tN - t0 ) / 1ms << " ms\n\n"; + + rec.time_ = ( tN - t0 ) / 1ms; + times.push_back( rec ); +} + +// mul31_hash + +struct mul31_hash +{ + // not avalanching + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + char const * p = st.data(); + std::size_t n = st.size(); + +#if SIZE_MAX > UINT32_MAX + std::size_t h = 0xCBF29CE484222325ull; +#else + std::size_t h = 0x811C9DC5u; +#endif + + for( std::size_t i = 0; i < n; ++i ) + { + h = h * 31 + static_cast( p[i] ); + } + + return h; + } +}; + +// mul31_x4_hash + +struct mul31_x4_hash +{ + // not avalanching + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + char const * p = st.data(); + std::size_t n = st.size(); + +#if SIZE_MAX > UINT32_MAX + std::size_t h = 0xCBF29CE484222325ull; +#else + std::size_t h = 0x811C9DC5u; +#endif + + while( n >= 4 ) + { + h = h * (31u * 31u * 31u * 31u) + + static_cast( p[0] ) * (31u * 31u * 31u) + + static_cast( p[1] ) * (31u * 31u) + + static_cast( p[2] ) * 31u + + static_cast( p[3] ); + + p += 4; + n -= 4; + } + + while( n > 0 ) + { + h = h * 31u + static_cast( *p ); + + ++p; + --n; + } + + return h; + } +}; + +// mul31_x8_hash + +struct mul31_x8_hash +{ + // not avalanching + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + char const * p = st.data(); + std::size_t n = st.size(); + + boost::uint64_t h = 0xCBF29CE484222325ull; + + while( n >= 8 ) + { + h = h * (31ull * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull) + + static_cast( p[0] ) * (31ull * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull) + + static_cast( p[1] ) * (31ull * 31ull * 31ull * 31ull * 31ull * 31ull) + + static_cast( p[2] ) * (31ull * 31ull * 31ull * 31ull * 31ull) + + static_cast( p[3] ) * (31ull * 31ull * 31ull * 31ull) + + static_cast( p[4] ) * (31ull * 31ull * 31ull) + + static_cast( p[5] ) * (31ull * 31ull) + + static_cast( p[6] ) * 31ull + + static_cast( p[7] ); + + p += 8; + n -= 8; + } + + while( n > 0 ) + { + h = h * 31u + static_cast( *p ); + + ++p; + --n; + } + + return static_cast( h ); + } +}; + +// fnv1a_hash + +template struct fnv1a_hash_impl; + +template<> struct fnv1a_hash_impl<32> +{ + std::size_t operator()( std::string_view const& s ) const + { + std::size_t h = 0x811C9DC5u; + + char const * first = s.data(); + char const * last = first + s.size(); + + for( ; first != last; ++first ) + { + h ^= static_cast( *first ); + h *= 0x01000193ul; + } + + return h; + } +}; + +template<> struct fnv1a_hash_impl<64> +{ + std::size_t operator()( std::string_view const& s ) const + { + std::size_t h = 0xCBF29CE484222325ull; + + char const * first = s.data(); + char const * last = first + s.size(); + + for( ; first != last; ++first ) + { + h ^= static_cast( *first ); + h *= 0x00000100000001B3ull; + } + + return h; + } +}; + +struct fnv1a_hash: fnv1a_hash_impl< std::numeric_limits::digits > +{ + using is_avalanching = void; +}; + +// std_hash + +struct std_hash: std::hash +{ + using is_avalanching = void; +}; + +// absl_hash + +#ifdef HAVE_ABSEIL + +struct absl_hash: absl::Hash +{ + using is_avalanching = void; +}; + +#endif + +#ifdef HAVE_MULXP_HASH + +struct mulxp0_hash_ +{ + using is_avalanching = void; + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + return mulxp0_hash( (unsigned char const*)st.data(), st.size(), 0 ); + } +}; + +struct mulxp1_hash_ +{ + using is_avalanching = void; + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + return mulxp1_hash( (unsigned char const*)st.data(), st.size(), 0 ); + } +}; + +struct mulxp2_hash_ +{ + using is_avalanching = void; + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + return mulxp2_hash( (unsigned char const*)st.data(), st.size(), 0 ); + } +}; + +struct mulxp3_hash_ +{ + using is_avalanching = void; + + std::size_t operator()( std::string_view const& st ) const BOOST_NOEXCEPT + { + return mulxp3_hash( (unsigned char const*)st.data(), st.size(), 0 ); + } +}; + +#endif + +// + +int main() +{ + init_words(); + + test< boost::hash >( "boost::hash" ); + test< std_hash >( "std::hash" ); + test< mul31_hash >( "mul31_hash" ); + test< mul31_x4_hash >( "mul31_x4_hash" ); + test< mul31_x8_hash >( "mul31_x8_hash" ); + test< fnv1a_hash >( "fnv1a_hash" ); + +#ifdef HAVE_ABSEIL + + test< absl_hash >( "absl::Hash" ); + +#endif + +#ifdef HAVE_MULXP_HASH + + test< mulxp0_hash_ >( "mulxp0_hash" ); + test< mulxp1_hash_ >( "mulxp1_hash" ); + test< mulxp2_hash_ >( "mulxp2_hash" ); + test< mulxp3_hash_ >( "mulxp3_hash" ); + +#endif + + std::cout << "---\n\n"; + + for( auto const& x: times ) + { + std::cout << std::setw( 22 ) << ( x.label_ + ": " ) << std::setw( 5 ) << x.time_ << " ms\n"; + } +} + +#ifdef HAVE_ABSEIL +# include "absl/hash/internal/hash.cc" +# include "absl/hash/internal/low_level_hash.cc" +# include "absl/hash/internal/city.cc" +#endif