diff --git a/include/boost/unordered/detail/foa/concurrent_table.hpp b/include/boost/unordered/detail/foa/concurrent_table.hpp
index 000a0c56..ad341e64 100644
--- a/include/boost/unordered/detail/foa/concurrent_table.hpp
+++ b/include/boost/unordered/detail/foa/concurrent_table.hpp
@@ -949,7 +949,7 @@ private:
       auto mask=pg->match(hash);
       if(mask){
         auto p=this->arrays.elements+pos*N;
-        BOOST_UNORDERED_PREFETCH_ELEMENTS(p);
+        BOOST_UNORDERED_PREFETCH_ELEMENTS(p,N);
         auto lck=access(access_mode,pos);
         do{
           auto n=unchecked_countr_zero(mask);
diff --git a/include/boost/unordered/detail/foa/core.hpp b/include/boost/unordered/detail/foa/core.hpp
index 52df266c..dcf107b9 100644
--- a/include/boost/unordered/detail/foa/core.hpp
+++ b/include/boost/unordered/detail/foa/core.hpp
@@ -85,33 +85,40 @@
 }while(0)
 #endif
 
+/* We use BOOST_UNORDERED_PREFETCH[_ELEMENTS] macros rather than proper
+ * functions because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109985
+ */
+
 #if defined(BOOST_GCC)||defined(BOOST_CLANG)
-#define BOOST_UNORDERED_PREFETCH(p) __builtin_prefetch((const char*)p)
+#define BOOST_UNORDERED_PREFETCH(p) __builtin_prefetch((const char*)(p))
 #elif defined(BOOST_UNORDERED_SSE2)
-#define BOOST_UNORDERED_PREFETCH(p) _mm_prefetch((const char*)p,_MM_HINT_T0)
+#define BOOST_UNORDERED_PREFETCH(p) _mm_prefetch((const char*)(p),_MM_HINT_T0)
 #else
-#define BOOST_UNORDERED_PREFETCH(p)
+#define BOOST_UNORDERED_PREFETCH(p) ((void)0)
 #endif
 
 /* We have experimentally confirmed that ARM architectures get a higher
- * speedup when around the first half of the element slots in a group are
- * prefetched, whereas for Intel just the first cache line is best.
- * Please report back if you find better tunings for some particular
- * architectures.
- */
+ * speedup when around the first half of the element slots in a group are
+ * prefetched, whereas for Intel just the first cache line is best.
+ * Please report back if you find better tunings for some particular
+ * architectures.
+ */
+
 #if BOOST_ARCH_ARM
 /* Cache line size can't be known at compile time, so we settle on
- * the very frequent value of 64B.
- */
-#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) \
-  do{ \
-    constexpr int cache_line=64; \
-    const char *p0=reinterpret_cast<const char*>(p), \
-               *p1=p0+sizeof(value_type)*N/2; \
-    for(;p0<p1;p0+=cache_line)BOOST_UNORDERED_PREFETCH(p0); \
-  }while(0)
+ * the very frequent value of 64B.
+ */
+#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p,N)                          \
+  do{                                                                   \
+    auto           BOOST_UNORDERED_P=(p);                               \
+    constexpr int  cache_line=64;                                       \
+    const char    *p0=reinterpret_cast<const char*>(BOOST_UNORDERED_P), \
+                  *p1=p0+sizeof(*BOOST_UNORDERED_P)*(N)/2;              \
+    for(;p0<p1;p0+=cache_line)BOOST_UNORDERED_PREFETCH(p0);             \
+  }while(0)
 #else
-#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) BOOST_UNORDERED_PREFETCH(p)
+#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p,N) BOOST_UNORDERED_PREFETCH(p)
 #endif