forked from boostorg/unordered
Replace prefetch_elements() with macro so builtins aren't optimized away by DSE
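The motivation is GCC bug 109985, cited in the second hunk below: GCC 12 and above have been seen to delete a __builtin_prefetch issued from inside a small wrapper function, treating it as a dead access. The patch's premise is that a function-like macro, which pastes the builtin directly into each call site, leaves no wrapper body for the optimizer to eliminate. A minimal sketch of the two shapes, assuming GCC or Clang; only the main() scaffolding is invented:

    // Old shape: the builtin sits behind a call; on affected GCC versions
    // the prefetch inside the wrapper may be optimized away by DSE.
    inline void prefetch(const void* p)
    {
      __builtin_prefetch((const char*)p);
    }

    // New shape: the macro expands the builtin in place at the call site.
    #define BOOST_UNORDERED_PREFETCH(p) __builtin_prefetch((const char*)p)

    int main()
    {
      char buf[256]={};
      prefetch(buf);                 // may compile to nothing (bug 109985)
      BOOST_UNORDERED_PREFETCH(buf); // emits the prefetch directly
      return buf[0];
    }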
@@ -85,6 +85,35 @@
 }while(0)
 #endif
 
+#if defined(BOOST_GCC)||defined(BOOST_CLANG)
+#define BOOST_UNORDERED_PREFETCH(p) __builtin_prefetch((const char*)p)
+#elif defined(BOOST_UNORDERED_SSE2)
+#define BOOST_UNORDERED_PREFETCH(p) _mm_prefetch((const char*)p,_MM_HINT_T0)
+#else
+#define BOOST_UNORDERED_PREFETCH(p)
+#endif
+
+/* We have experimentally confirmed that ARM architectures get a higher
+ * speedup when around the first half of the element slots in a group are
+ * prefetched, whereas for Intel just the first cache line is best.
+ * Please report back if you find better tunings for some particular
+ * architectures.
+ */
+#if BOOST_ARCH_ARM
+/* Cache line size can't be known at compile time, so we settle on
+ * the very frequent value of 64B.
+ */
+#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) \
+do{ \
+  constexpr int cache_line=64; \
+  const char *p0=reinterpret_cast<const char*>(p), \
+             *p1=p0+sizeof(value_type)*N/2; \
+  for(;p0<p1;p0+=cache_line)BOOST_UNORDERED_PREFETCH(p0); \
+}while(0)
+#else
+#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) BOOST_UNORDERED_PREFETCH(p)
+#endif
+
 #ifdef __has_feature
 #define BOOST_UNORDERED_HAS_FEATURE(x) __has_feature(x)
 #else
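Two details of the new macros are worth noting. The multi-line ARM variant is wrapped in do{...}while(0) so that an expansion followed by a semicolon parses as a single statement, e.g. after a bare if; the non-ARM variant simply forwards to a single-line prefetch. The ARM loop prefetches the first half of a group's element slots one assumed 64-byte cache line at a time. The standalone sketch below makes the loop bounds concrete with invented numbers (sizeof(value_type)==32 and N==15 are illustrative assumptions, not values taken from the patch):

    #include <cstdio>

    int main()
    {
      // Mirrors the loop bounds in BOOST_UNORDERED_PREFETCH_ELEMENTS (ARM).
      constexpr int cache_line=64; // assumed line size, as in the macro
      constexpr int value_size=32; // hypothetical sizeof(value_type)
      constexpr int N=15;          // hypothetical element slots per group

      // Half the slots span 32*15/2 == 240 bytes, so prefetches land on
      // the cache lines at byte offsets 0, 64, 128 and 192.
      for(int off=0;off<value_size*N/2;off+=cache_line)
        std::printf("prefetch at group offset %d\n",off);
      return 0;
    }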
@@ -1037,21 +1066,6 @@ void swap_if(T& x,T& y){using std::swap; swap(x,y);}
 template<bool B,typename T,typename std::enable_if<!B>::type* =nullptr>
 void swap_if(T&,T&){}
 
-inline void prefetch(const void* p)
-{
-  (void) p;
-#if BOOST_WORKAROUND(BOOST_GCC, >= 120000) && defined(BOOST_UNORDERED_SSE2)
-  // gcc-12 and above seem to remove the `__builtin_prefetch` call below so we
-  // manually insert the instruction via an asm declaration.
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109985
-  asm("prefetcht0 %[ptr]"::[ptr]"m"(*(char const*)p):);
-#elif defined(BOOST_GCC)||defined(BOOST_CLANG)
-  __builtin_prefetch((const char*)p);
-#elif defined(BOOST_UNORDERED_SSE2)
-  _mm_prefetch((const char*)p,_MM_HINT_T0);
-#endif
-}
-
 template<typename Allocator>
 struct is_std_allocator:std::false_type{};
 
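Note that the deleted helper already carried a workaround for the same optimizer behavior: on GCC 12+ with SSE2 it bypassed the builtin and emitted prefetcht0 through an asm declaration whose "m" input operand makes the read of *p visible to the compiler (and an asm with no output operands is implicitly volatile, so it is not deleted). That pattern, isolated as a standalone sketch for x86/x86-64 GCC or Clang; the function name is hypothetical:

    // The compiler sees a use of *p through the "m" constraint and keeps
    // the instruction; prefetcht0 pulls the line into all cache levels.
    inline void asm_prefetch(const void* p)
    {
      asm("prefetcht0 %[ptr]" : : [ptr] "m" (*(char const*)p));
    }

    int main()
    {
      char buf[64]={};
      asm_prefetch(buf);
      return buf[0];
    }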
@@ -1457,7 +1471,7 @@ public:
       if(mask){
         BOOST_UNORDERED_ASSUME(arrays.elements!=nullptr);
         auto p=arrays.elements+pos*N;
-        prefetch_elements(p);
+        BOOST_UNORDERED_PREFETCH_ELEMENTS(p);
         do{
           auto n=unchecked_countr_zero(mask);
           if(BOOST_LIKELY(bool(pred()(x,key_from(p[n]))))){
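A subtlety that makes this one-line swap work: BOOST_UNORDERED_PREFETCH_ELEMENTS names value_type and N in its body but is defined at namespace scope, where neither exists. Because macro expansion is textual, those names resolve at the point of use, here inside the table class, exactly as they did in the member function being replaced. A toy demonstration of that scoping; all names below are invented:

    #include <cstddef>
    #include <cstdio>

    // Defined where value_type and N mean nothing; the names only have to
    // exist at the point of expansion.
    #define HALF_GROUP_BYTES() (sizeof(value_type)*N/2)

    struct toy_table
    {
      using value_type=double;
      static constexpr std::size_t N=15;

      static std::size_t half_bytes()
      {
        return HALF_GROUP_BYTES(); // expands against this class's names
      }
    };

    int main()
    {
      std::printf("%zu\n",toy_table::half_bytes()); // 60 when sizeof(double)==8
      return 0;
    }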
@@ -1650,28 +1664,6 @@ public:
     return pg->match_occupied()&~(int(pg==last-1)<<(N-1));
   }
 
-  static inline void prefetch_elements(const element_type* p)
-  {
-    /* We have experimentally confirmed that ARM architectures get a higher
-     * speedup when around the first half of the element slots in a group are
-     * prefetched, whereas for Intel just the first cache line is best.
-     * Please report back if you find better tunings for some particular
-     * architectures.
-     */
-
-#if BOOST_ARCH_ARM
-    /* Cache line size can't be known at compile time, so we settle on
-     * the very frequent value of 64B.
-     */
-    constexpr int cache_line=64;
-    const char *p0=reinterpret_cast<const char*>(p),
-               *p1=p0+sizeof(value_type)*N/2;
-    for(;p0<p1;p0+=cache_line)prefetch(p0);
-#else
-    prefetch(p);
-#endif
-  }
-
   template<typename... Args>
   locator unchecked_emplace_at(
     std::size_t pos0,std::size_t hash,Args&&... args)