Replace prefetch_elements() with macro so builtins aren't optimized away by DSE

Christian Mazakas
2023-05-26 20:56:52 -07:00
parent 2ab4225473
commit 7874625c08

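For context before the diff: a minimal standalone sketch of the pattern this commit adopts. DEMO_PREFETCH and sum are illustrative names, not the library's; the point is that the hint expands directly at the call site instead of living inside a tiny wrapper function, the form gcc-12+ was observed to drop (see the GCC bug referenced in the removed helper further down).

#if defined(__GNUC__)||defined(__clang__)
#define DEMO_PREFETCH(p) __builtin_prefetch((const char*)(p))
#else
#define DEMO_PREFETCH(p) ((void)0)
#endif

int sum(const int* a,int n)
{
  DEMO_PREFETCH(a); /* expands inline at the call site; no wrapper function
                       for the optimizer to strip the hint out of */
  int s=0;
  for(int i=0;i<n;++i)s+=a[i];
  return s;
}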

@@ -85,6 +85,35 @@
}while(0)
#endif
#if defined(BOOST_GCC)||defined(BOOST_CLANG)
#define BOOST_UNORDERED_PREFETCH(p) __builtin_prefetch((const char*)p)
#elif defined(BOOST_UNORDERED_SSE2)
#define BOOST_UNORDERED_PREFETCH(p) _mm_prefetch((const char*)p,_MM_HINT_T0)
#else
#define BOOST_UNORDERED_PREFETCH(p)
#endif
/* We have experimentally confirmed that ARM architectures get a higher
* speedup when around the first half of the element slots in a group are
* prefetched, whereas for Intel just the first cache line is best.
* Please report back if you find better tunings for some particular
* architectures.
*/
#if BOOST_ARCH_ARM
/* Cache line size can't be known at compile time, so we settle on
* the very frequent value of 64B.
*/
#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) \
do{ \
constexpr int cache_line=64; \
const char *p0=reinterpret_cast<const char*>(p), \
*p1=p0+sizeof(value_type)*N/2; \
for(;p0<p1;p0+=cache_line)BOOST_UNORDERED_PREFETCH(p0); \
}while(0)
#else
#define BOOST_UNORDERED_PREFETCH_ELEMENTS(p) BOOST_UNORDERED_PREFETCH(p)
#endif
#ifdef __has_feature
#define BOOST_UNORDERED_HAS_FEATURE(x) __has_feature(x)
#else
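A quick worked illustration of the ARM branch just added: with assumed parameters (N and sizeof(value_type) are hypothetical here; in the real code they come from the container instantiation), the loop below reproduces the offsets the macro would touch.

#include <cstdio>

int main()
{
  constexpr int N=15;            /* assumed slots per group                */
  constexpr int value_size=32;   /* assumed sizeof(value_type) in bytes    */
  constexpr int cache_line=64;   /* same 64B assumption as the macro above */
  int hints=0;
  for(int off=0;off<value_size*N/2;off+=cache_line){
    std::printf("prefetch hint at offset %d\n",off);
    ++hints;
  }
  std::printf("%d cache lines hinted (~first half of the group)\n",hints);
  return 0;
}

With those assumed numbers the hints land at offsets 0, 64, 128 and 192, i.e. four cache lines covering the first 240 of 480 bytes of the group's element storage.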
@@ -1037,21 +1066,6 @@ void swap_if(T& x,T& y){using std::swap; swap(x,y);}
template<bool B,typename T,typename std::enable_if<!B>::type* =nullptr>
void swap_if(T&,T&){}
inline void prefetch(const void* p)
{
(void) p;
#if BOOST_WORKAROUND(BOOST_GCC, >= 120000) && defined(BOOST_UNORDERED_SSE2)
// gcc-12 and above seem to remove the `__builtin_prefetch` call below, so we
// manually insert the instruction via an asm declaration.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109985
asm("prefetcht0 %[ptr]"::[ptr]"m"(*(char const*)p):);
#elif defined(BOOST_GCC)||defined(BOOST_CLANG)
__builtin_prefetch((const char*)p);
#elif defined(BOOST_UNORDERED_SSE2)
_mm_prefetch((const char*)p,_MM_HINT_T0);
#endif
}
template<typename Allocator>
struct is_std_allocator:std::false_type{};
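For reference, the helper deleted above needed the inline asm precisely because gcc-12+ dropped the builtin form inside the trivial wrapper; the two functions below are a sketch (not library code) that request the same T0 prefetch on x86 and differ only in how visible they are to the optimizer.

inline void prefetch_via_builtin(const void* p)
{
  __builtin_prefetch((const char*)p);  /* the form gcc-12+ was reported to
                                          elide inside a trivial wrapper
                                          (GCC bug 109985)                 */
}

inline void prefetch_via_asm(const void* p)
{
  asm("prefetcht0 %[ptr]"::[ptr]"m"(*(const char*)p):); /* no outputs, so the
                                                           statement is
                                                           implicitly volatile
                                                           and always emitted */
}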
@@ -1457,7 +1471,7 @@ public:
if(mask){
BOOST_UNORDERED_ASSUME(arrays.elements!=nullptr);
auto p=arrays.elements+pos*N;
prefetch_elements(p);
BOOST_UNORDERED_PREFETCH_ELEMENTS(p);
do{
auto n=unchecked_countr_zero(mask);
if(BOOST_LIKELY(bool(pred()(x,key_from(p[n]))))){
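The call site above is easier to see in isolation; here is a simplified standalone sketch of the same pattern, with hypothetical names and a plain __builtin_prefetch standing in for BOOST_UNORDERED_PREFETCH_ELEMENTS (the real code compares against key_from(p[n]) through the stored predicate).

#include <cstdint>

template<typename T,typename Key,typename Pred>
const T* scan_candidates(const T* elements,std::uint32_t mask,
                         const Key& x,Pred pred)
{
  if(mask){
    /* hint the group's element storage once, before walking the candidates */
    __builtin_prefetch((const char*)elements);
    do{
      int n=__builtin_ctz(mask);  /* index of the lowest candidate slot     */
      if(pred(x,elements[n]))return elements+n;
      mask&=mask-1;               /* clear that bit and try the next match  */
    }while(mask);
  }
  return nullptr;
}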
@@ -1650,28 +1664,6 @@ public:
return pg->match_occupied()&~(int(pg==last-1)<<(N-1));
}
static inline void prefetch_elements(const element_type* p)
{
/* We have experimentally confirmed that ARM architectures get a higher
* speedup when around the first half of the element slots in a group are
* prefetched, whereas for Intel just the first cache line is best.
* Please report back if you find better tunings for some particular
* architectures.
*/
#if BOOST_ARCH_ARM
/* Cache line size can't be known at compile time, so we settle on
* the very frequent value of 64B.
*/
constexpr int cache_line=64;
const char *p0=reinterpret_cast<const char*>(p),
*p1=p0+sizeof(value_type)*N/2;
for(;p0<p1;p0+=cache_line)prefetch(p0);
#else
prefetch(p);
#endif
}
template<typename... Args>
locator unchecked_emplace_at(
std::size_t pos0,std::size_t hash,Args&&... args)