Refactored and documented the sort and merge algorithm

2025-09-30 01:30:54 +02:00 · 2016-03-23 22:11:06 +01:00
parent f86a3a40bb
commit cae8d2dda3
1 changed files with 322 additions and 156 deletions
--- a/include/boost/move/algo/detail/adaptive_sort_merge.hpp
+++ b/include/boost/move/algo/detail/adaptive_sort_merge.hpp
@@ -37,8 +37,8 @@
 //   elements twice.
 //
 // The adaptive_merge algorithm was developed by Ion Gaztanaga reusing some parts
-// from the sorting algorithm and implementing a block merge algorithm
-// without moving elements left or right, which is used when external memory
+// from the sorting algorithm and implementing an additional block merge algorithm
+// without moving elements to left or right, which is used when external memory
 // is available.
 //////////////////////////////////////////////////////////////////////////////
 #ifndef BOOST_MOVE_ADAPTIVE_SORT_MERGE_HPP
@@ -371,7 +371,7 @@ RandIt op_partial_merge_with_buf_impl
   //Now merge from buffer
   if(first2 != last2)
   while(1){
-		if(comp(*first2, *buf_first1)) {
+      if(comp(*first2, *buf_first1)) {
         op(first2++, first1++);
         if(first2 == last2)
            break;
@@ -450,7 +450,7 @@ void op_merge_blocks_with_buf
         skip_first_it = false;
         bool const last_it = key_first == key_end;
         //If the trailing block is empty, we'll make it equal to the previous if empty
-		   bool const is_range2_A  = last_it ? (!l_irreg2 && is_range1_A) : key_comp(*key_first, midkey);
+         bool const is_range2_A  = last_it ? (!l_irreg2 && is_range1_A) : key_comp(*key_first, midkey);

         if(is_range1_A == is_range2_A){
            if(buffer != buffer_end){
@@ -565,8 +565,8 @@ RandIt op_partial_merge_left_impl
   while(first1 != last1){
      if(first2 == last2){
         return first1;
-	   }
-		if(comp(*first2, *first1)) {
+      }
+      if(comp(*first2, *first1)) {
         op(first2, buf_first);
         ++first2;
      }
@@ -670,11 +670,11 @@ RandIt op_partial_merge_left_smart_impl
   BOOST_ASSERT(0 != (last1-first1));
   if(first2 != last2)
   while(1){
-		if(comp(*first2, *first1)) {
+      if(comp(*first2, *first1)) {
         op(first2++, dest++);
         if(first2 == last2){
            return first1;
-   	   }
+         }
      }
      else{
         op(first1++, dest++);
@@ -719,7 +719,7 @@ void op_merge_blocks_left
 {
   if(n_bef_irreg2 == 0){
      RandIt const last_reg(first+l_irreg1+n_aft_irreg2*l_block);
-		op_merge_left(first-l_block, first, last_reg, last_reg+l_irreg2, comp, op);
+      op_merge_left(first-l_block, first, last_reg, last_reg+l_irreg2, comp, op);
   }
   else {
      RandIt buffer = first  - l_block;
@@ -736,8 +736,8 @@ void op_merge_blocks_left
         skip_first_it = false;
         bool const last_it = key_first == key_end;
         //If the trailing block is empty, we'll make it equal to the previous if empty
-		   bool const is_range2_A    = last_it ? (!l_irreg2 && is_range1_A) : key_comp(*key_first, midkey);
-		   bool const is_buffer_middle = last1 == buffer;
+         bool const is_range2_A    = last_it ? (!l_irreg2 && is_range1_A) : key_comp(*key_first, midkey);
+         bool const is_buffer_middle = last1 == buffer;

         if(is_range1_A == is_range2_A){
            //If range1 is buffered, write it to its final position
@@ -824,7 +824,7 @@ RandIt op_partial_merge_right_impl
 {
   RandIt const first2 = last1;
   while(first2 != last2){
-		if(last1 == first1){
+      if(last1 == first1){
         return last2;
      }
      --last2;
@@ -881,7 +881,7 @@ void op_merge_blocks_right
   for(bool is_range2_A = false; key_first != key_end; last1 = first1, first1 -= l_block){
      --key_end;
      bool const is_range1_A = key_comp(*key_end, midkey);
-		bool const is_buffer_middle = first2 == buffer_end;
+      bool const is_buffer_middle = first2 == buffer_end;

      if(is_range1_A == is_range2_A){
         if(!is_buffer_middle){
@@ -937,17 +937,17 @@ RandIt partial_merge_bufferless_impl
      return first1;
   }
   bool const is_range1_A = *pis_range1_A;
-	if(first1 != last1 && comp(*last1, last1[-1])){
+   if(first1 != last1 && comp(*last1, last1[-1])){
      do{
         RandIt const old_last1 = last1;
-			last1  = lower_bound(last1, last2, *first1, comp);
+         last1  = lower_bound(last1, last2, *first1, comp);
         first1 = rotate_gcd(first1, old_last1, last1);//old_last1 == last1 supported
         if(last1 == last2){
            return first1;
         }
         do{
            ++first1;
-			} while(last1 != first1 && !comp(*last1, *first1) );
+         } while(last1 != first1 && !comp(*last1, *first1) );
      } while(first1 != last1);
   }
   *pis_range1_A = !is_range1_A;
@@ -993,7 +993,7 @@ void merge_blocks_bufferless
      bool is_range1_A = l_irreg1 ? true : key_comp(*key_first++, midkey);

      for( ; key_first != key_end; ++key_first){
-		   bool is_range2_A = key_comp(*key_first, midkey);
+         bool is_range2_A = key_comp(*key_first, midkey);
         if(is_range1_A == is_range2_A){
            first1 = last1;
         }
@@ -1077,9 +1077,9 @@ typename iterator_traits<RandIt>::size_type
      if(xbuf.capacity() >= max_collected){
         value_type *const ph0 = xbuf.add(first);
         while(u != last && h < max_collected){
-		      value_type * const r = lower_bound(ph0, xbuf.end(), *u, comp);
+            value_type * const r = lower_bound(ph0, xbuf.end(), *u, comp);
            //If key not found add it to [h, h+h0)
-		      if(r == xbuf.end() || comp(*u, *r) ){
+            if(r == xbuf.end() || comp(*u, *r) ){
               RandIt const new_h0 = boost::move(search_end, u, h0);
               search_end = u;
               ++search_end;
@@ -1094,9 +1094,9 @@ typename iterator_traits<RandIt>::size_type
      }
      else{
         while(u != last && h < max_collected){
-		      RandIt const r = lower_bound(h0, search_end, *u, comp);
+            RandIt const r = lower_bound(h0, search_end, *u, comp);
            //If key not found add it to [h, h+h0)
-		      if(r == search_end || comp(*u, *r) ){
+            if(r == search_end || comp(*u, *r) ){
               RandIt const new_h0 = rotate_gcd(h0, search_end, u);
               search_end = u;
               ++search_end;
@@ -1222,11 +1222,11 @@ void slow_stable_sort
      if(do_merge){
         size_type const h_2 = 2*h;
         while((L-p0) > h_2){
-			   merge_bufferless(first+p0, first+p0+h, first+p0+h_2, comp);
+            merge_bufferless(first+p0, first+p0+h, first+p0+h_2, comp);
            p0 += h_2;
         }
      }
-		if((L-p0) > h){
+      if((L-p0) > h){
         merge_bufferless(first+p0, first+p0+h, last, comp);
      }
   }
@@ -1401,7 +1401,7 @@ void combine_params
      size_type const irreg_off = is_merge_left ? 0u: l_irreg2-1;
      RandIt prev_block_first = first + l_combined - l_irreg2;
      const value_type &incomplete_block_first = prev_block_first[irreg_off];
-		while(n_aft_irreg2 != n_reg_block && 
+      while(n_aft_irreg2 != n_reg_block && 
            comp(incomplete_block_first, (prev_block_first-= l_block)[reg_off]) ){
         ++n_aft_irreg2;
      }
@@ -1709,12 +1709,12 @@ void op_merge_right_step
   if(restk <= l_build_buf){
      op(backward_t(),first_block+p, first_block+p+restk, first_block+p+restk+l_build_buf);
   }
-	else{
+   else{
      op_merge_right(first_block+p, first_block+p+l_build_buf, first_block+p+restk, first_block+p+restk+l_build_buf, comp, op);
   }
   while(p>0){
      p -= 2*l_build_buf;
-		op_merge_right(first_block+p, first_block+p+l_build_buf, first_block+p+2*l_build_buf, first_block+p+3*l_build_buf, comp, op);
+      op_merge_right(first_block+p, first_block+p+l_build_buf, first_block+p+2*l_build_buf, first_block+p+3*l_build_buf, comp, op);
   }
 }

@@ -1954,13 +1954,13 @@ void stable_merge


 template<class RandIt, class Compare>
-void final_merge( bool buffer_right
-                , RandIt const first
-                , typename iterator_traits<RandIt>::size_type const l_intbuf
-                , typename iterator_traits<RandIt>::size_type const n_keys
-                , typename iterator_traits<RandIt>::size_type const len
-                , adaptive_xbuf<typename iterator_traits<RandIt>::value_type> & xbuf
-                , Compare comp)
+void adaptive_sort_final_merge( bool buffer_right
+                              , RandIt const first
+                              , typename iterator_traits<RandIt>::size_type const l_intbuf
+                              , typename iterator_traits<RandIt>::size_type const n_keys
+                              , typename iterator_traits<RandIt>::size_type const len
+                              , adaptive_xbuf<typename iterator_traits<RandIt>::value_type> & xbuf
+                              , Compare comp)
 {
   BOOST_ASSERT(n_keys || xbuf.size() == l_intbuf);
   xbuf.clear();
@@ -2009,7 +2009,7 @@ bool build_params
   //segments of size l_build_buf*2, maximizing the classic merge phase.
   l_intbuf = size_type(ceil_sqrt_multiple(len, &l_base));

-   //This is the minimum number of case to implement the ideal algorithm
+   //This is the minimum number of keys to implement the ideal algorithm
   //
   //l_intbuf is used as buffer plus the key count
   size_type n_min_ideal_keys = l_intbuf-1u;
@@ -2030,10 +2030,10 @@ bool build_params
      //
      //If available memory is 2*sqrt(l), then only sqrt(l) unique keys are needed,
      //(to be used for keys in combine_all_blocks) as the whole l_build_buf
-      //we'll be backuped in the buffer during build_blocks.
+      //will be backuped in the buffer during build_blocks.
      bool const non_unique_buf = xbuf.capacity() >= 2*l_intbuf;
      size_type const to_collect = non_unique_buf ? l_intbuf : l_intbuf*2;
-	   size_type collected = collect_unique(first, first+len, to_collect, comp, xbuf);
+      size_type collected = collect_unique(first, first+len, to_collect, comp, xbuf);

      //If available memory is 2*sqrt(l), then for "build_params" 
      //the situation is the same as if 2*l_intbuf were collected.
@@ -2044,7 +2044,7 @@ bool build_params
      //is possible (due to very low unique keys), then go to a slow sort based on rotations.
      if(collected < (n_min_ideal_keys+l_intbuf)){
         if(collected < 4){  //No combination possible with less that 4 keys
-			   return false;
+            return false;
         }
         n_keys = l_intbuf;
         while(n_keys&(n_keys-1)){
@@ -2053,6 +2053,7 @@ bool build_params
         while(n_keys > collected){
            n_keys/=2;
         }
+         //AdaptiveSortInsertionSortThreshold is always power of two so the minimum is power of two
         l_base = min_value<Unsigned>(n_keys, AdaptiveSortInsertionSortThreshold);
         l_intbuf = 0;
         l_build_buf = n_keys;
@@ -2072,6 +2073,218 @@ bool build_params
   return true;
 }

+
+#define BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
+
+template<class RandIt, class Compare>
+inline void adaptive_merge_combine_blocks( RandIt first
+                                      , typename iterator_traits<RandIt>::size_type len1
+                                      , typename iterator_traits<RandIt>::size_type len2
+                                      , typename iterator_traits<RandIt>::size_type collected
+                                      , typename iterator_traits<RandIt>::size_type n_keys
+                                      , typename iterator_traits<RandIt>::size_type l_block
+                                      , bool use_internal_buf
+                                      , bool xbuf_used
+                                      , Compare comp
+                                      , adaptive_xbuf<typename iterator_traits<RandIt>::value_type> & xbuf
+                                      )
+{
+   typedef typename iterator_traits<RandIt>::size_type size_type;
+   size_type const len = len1+len2;
+   size_type const l_combine  = len-collected;
+   size_type const l_combine1 = len1-collected;
+   size_type n_bef_irreg2, n_aft_irreg2, l_irreg1, l_irreg2, midkey_idx;
+   if(n_keys){
+      RandIt const first_data = first+collected;
+      RandIt const keys = first;
+      combine_params( keys, comp, first_data, l_combine
+                     , l_combine1, l_block, xbuf, comp
+                     , midkey_idx, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, true);   //Outputs
+      BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A combine: ", len);
+      if(xbuf_used){
+         merge_blocks_with_buf
+            (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf, xbuf_used);
+         BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg xbf: ", len);
+      }
+      else if(use_internal_buf){
+
+         #ifdef BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
+         range_xbuf<RandIt, swap_op> rbuf(first_data-l_block, first_data);
+         merge_blocks_with_buf
+            (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, rbuf, xbuf_used);
+         BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg buf: ", len);
+         #else
+         merge_blocks_left
+            (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf_used);
+         BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg lft: ", len);
+         #endif
+      }
+      else{
+         merge_blocks_bufferless
+            (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp);
+         BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg bfl: ", len);
+      }
+   }
+   else{
+      xbuf.clear();
+      size_type *const uint_keys = xbuf.template aligned_trailing<size_type>(l_block);
+      combine_params( uint_keys, less(), first, l_combine
+                     , l_combine1, l_block, xbuf, comp
+                     , midkey_idx, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, true);   //Outputs
+      BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A combine: ", len);
+      merge_blocks_with_buf
+         (uint_keys, uint_keys[midkey_idx], less(), first, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf, true);
+      xbuf.clear();
+      BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg lft: ", len);
+   }
+
+}
+
+template<class RandIt, class Compare>
+inline void adaptive_merge_final_merge( RandIt first
+                                      , typename iterator_traits<RandIt>::size_type len1
+                                      , typename iterator_traits<RandIt>::size_type len2
+                                      , typename iterator_traits<RandIt>::size_type collected
+                                      , typename iterator_traits<RandIt>::size_type l_intbuf
+                                      , typename iterator_traits<RandIt>::size_type l_block
+                                      , bool use_internal_buf
+                                      , bool xbuf_used
+                                      , Compare comp
+                                      , adaptive_xbuf<typename iterator_traits<RandIt>::value_type> & xbuf
+                                      )
+{
+   typedef typename iterator_traits<RandIt>::size_type size_type;
+   (void)l_block;
+   size_type n_keys = collected-l_intbuf;
+   size_type len = len1+len2;
+   if(use_internal_buf){
+      if(xbuf_used){
+         xbuf.clear();
+         //Nothing to do
+         if(n_keys){
+            stable_sort(first, first+n_keys, comp, xbuf);
+            stable_merge(first, first+n_keys, first+len, comp, xbuf);
+         }
+      }
+      else{
+         #ifdef BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
+         xbuf.clear();
+         stable_sort(first, first+collected, comp, xbuf);
+         stable_merge(first, first+collected, first+len, comp, xbuf);
+         #else
+         xbuf.clear();
+         stable_sort(first+len-l_block, first+len, comp, xbuf);
+         RandIt const pos1 = lower_bound(first+n_keys, first+len-l_block, first[len-1], comp);
+         RandIt const pos2 = rotate_gcd(pos1, first+len-l_block, first+len);
+         stable_merge(first+n_keys, pos1, pos2, antistable<Compare>(comp), xbuf);
+         if(n_keys){
+            stable_sort(first, first+n_keys, comp, xbuf);
+            stable_merge(first, first+n_keys, first+len, comp, xbuf);
+         }
+         #endif
+      }
+
+      BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A buf mrg: ", len);
+   }
+   else{
+      stable_sort(first, first+collected, comp, xbuf);
+      xbuf.clear();
+      if(xbuf.capacity() >= collected){
+         buffered_merge(first, first+collected, first+len1+len2, comp, xbuf);
+      }
+      else{
+         merge_bufferless(first, first+collected, first+len1+len2, comp);
+      }
+   }
+   BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A key mrg: ", len);
+}
+
+template<class SizeType, class Xbuf>
+inline SizeType adaptive_merge_n_keys_intbuf(SizeType l_block, SizeType len, Xbuf & xbuf, SizeType &l_intbuf_inout)
+{
+   typedef SizeType size_type;
+   size_type l_intbuf = xbuf.capacity() >= l_block ? 0u : l_block;
+
+   //This is the minimum number of keys to implement the ideal algorithm
+   //ceil(len/l_block) - 1 (as the first block is used as buffer)
+   size_type n_keys = l_block;
+   while(n_keys >= (len-l_intbuf-n_keys)/l_block){
+      --n_keys;
+   }
+   ++n_keys;
+   BOOST_ASSERT(n_keys < l_block);
+
+   if(xbuf.template supports_aligned_trailing<size_type>(l_block, n_keys)){
+      n_keys = 0u;
+   }
+   l_intbuf_inout = l_intbuf;
+   return n_keys;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+
+// Main explanation of the sort algorithm.
+//
+// csqrtlen = ceil(sqrt(len));
+//
+// * First, 2*csqrtlen unique elements are extracted from the elements to be
+//   sorted and placed in the beginning of the range.
+//
+// * Step "build_blocks": In this nearly-classic merge step, 2*csqrtlen unique elements
+//   will be used as auxiliary memory, so trailing len-2*csqrtlen elements
+//   are grouped in blocks of sorted 4*csqrtlen elements. At the end of the step
+//   2*csqrtlen unique elements are again the leading elements of the whole range.
+//
+// * Step "combine_blocks": pairs of previously formed blocks are merged with a different
+//   ("smart") algorithm to form blocks of 8*csqrtlen elements. This step is slower than the
+//   "build_blocks" step and repeated iteratively (forming blocks of 16*csqrtlen, 32*csqrtlen
+//   elements, etc) until all trailing (len-2*csqrtlen) elements are merged.
+//
+//   In "combine_blocks" len/csqrtlen elements are used as "keys" (markers) to
+//   know if elements belong to the first or second block to be merged and another
+//   leading csqrtlen elements are used as buffer. Explanation of the "combine_blocks" step:
+//
+//   Iteratively until all trailing (len-2*csqrtlen) elements are merged:
+//      Iteratively for each pair of previously merged blocks:
+//         * Blocks are divided into groups of csqrtlen elements and
+//           2*merged_block/csqrtlen keys are sorted to be used as markers
+//         * Groups are selection-sorted by first or last element (depending on whether they
+//           are merged to left or right) and keys are reordered accordingly as an imitation-buffer.
+//         * Elements of each block pair are merged using the csqrtlen buffer taking into account
+//           if they belong to the first half or second half (marked by the key).
+//
+// * In the final merge step leading elements (2*csqrtlen) are sorted and merged with
+//   rotations with the rest of sorted elements in the "combine_blocks" step.
+//
+// Corner cases:
+//
+// * If no 2*csqrtlen elements can be extracted:
+//
+//    * If csqrtlen+len/csqrtlen are extracted, then only csqrtlen elements are used
+//      as buffer in the "build_blocks" step forming blocks of 2*csqrtlen elements. This
+//      means that an additional "combine_blocks" step will be needed to merge all elements.
+//    
+//    * If no csqrtlen+len/csqrtlen elements can be extracted, but still more than a minimum,
+//      then the number of elements used as buffer and keys in the "build_blocks"
+//      and "combine_blocks" steps is reduced. If "combine_blocks" does not have enough keys
+//      due to this reduction then a rotation-based smart merge is used.
+//
+//    * If the minimum number of keys can't be extracted, a rotation-based sorting is performed.
+//
+// * If auxiliary memory is greater than or equal to ceil(len/2), half-copying mergesort is used.
+//
+// * If auxiliary memory is more than csqrtlen+n_keys*sizeof(std::size_t),
+//   then only csqrtlen elements need to be extracted and "combine_blocks" will use integral
+//   keys to combine blocks.
+//
+// * If auxiliary memory is available, the "build_blocks" will be extended to build bigger blocks
+//   using classic merge.
 template<class RandIt, class Compare>
 void adaptive_sort_impl
   ( RandIt first
@@ -2093,7 +2306,7 @@ void adaptive_sort_impl
      return;
   }

-   //Make sure it is at least two
+   //Make sure it is at least four
   BOOST_STATIC_ASSERT(AdaptiveSortInsertionSortThreshold >= 4);

   size_type l_base = 0;
@@ -2101,12 +2314,14 @@ void adaptive_sort_impl
   size_type n_keys = 0;
   size_type l_build_buf = 0;

+   //Calculate and extract needed unique elements. If a minimum is not achieved
+   //fall back to a rotation-based sort
   if(!build_params(first, len, comp, n_keys, l_intbuf, l_base, l_build_buf, xbuf)){
      stable_sort(first, first+len, comp, xbuf);
      return;
   }

-   //Otherwise, continue in adaptive_sort
+   //Otherwise, continue the adaptive_sort
   BOOST_MOVE_ADAPTIVE_SORT_PRINT("\n   After collect_unique: ", len);
   size_type const n_key_plus_buf = l_intbuf+n_keys;
   //l_build_buf is always power of two if l_intbuf is zero
@@ -2122,9 +2337,51 @@ void adaptive_sort_impl
      (first, n_keys, first+n_keys, len-n_keys, l_merged, l_intbuf, xbuf, comp);

   //Sort keys and buffer and merge the whole sequence
-   final_merge(buffer_right, first, l_intbuf, n_keys, len, xbuf, comp);
+   adaptive_sort_final_merge(buffer_right, first, l_intbuf, n_keys, len, xbuf, comp);
 }

+// Main explanation of the merge algorithm.
+//
+// csqrtlen = ceil(sqrt(len));
+//
+// * First, csqrtlen [to be used as buffer] + (len/csqrtlen - 1) [to be used as keys] => to_collect
+//   unique elements are extracted from elements to be sorted and placed in the beginning of the range.
+//
+// * Step "combine_blocks": the leading (len1-to_collect) elements plus trailing len2 elements
+//   are merged with a non-trivial ("smart") algorithm to form an ordered range of the trailing "len-to_collect" elements.
+//
+//   Explanation of the "combine_blocks" step:
+//
+//         * Trailing [first+to_collect, first+len1) elements are divided in groups of csqrtlen elements.
+//           Remaining elements that can't form a group are grouped in the front of those elements.
+//         * Trailing [first+len1, first+len1+len2) elements are divided in groups of csqrtlen elements.
+//           Remaining elements that can't form a group are grouped in the back of those elements.
+//         * Groups are selection-sorted by first or last element (depending on whether they
+//           are merged to left or right) and keys are reordered accordingly as an imitation-buffer.
+//         * Elements of each block pair are merged using the csqrtlen buffer taking into account
+//           if they belong to the first half or second half (marked by the key).
+//
+// * In the final merge step leading "to_collect" elements are merged with rotations
+//   with the rest of merged elements in the "combine_blocks" step.
+//
+// Corner cases:
+//
+// * If no "to_collect" elements can be extracted:
+//
+//    * If more than a minimum number of elements is extracted
+//      then the number of elements used as buffer and keys in the
+//      "combine_blocks" step is reduced. If "combine_blocks" does not have enough keys
+//      due to this reduction then a rotation-based smart merge is used.
+//
+//    * If the minimum number of keys can't be extracted, a rotation-based merge is performed.
+//
+// * If auxiliary memory is greater than or equal to min(len1, len2), a buffered merge is performed.
+//
+// * If len1 or len2 is less than or equal to 2*csqrtlen then a rotation-based merge is performed.
+//
+// * If auxiliary memory is more than csqrtlen+n_keys*sizeof(std::size_t),
+//   then no csqrtlen elements need to be extracted and "combine_blocks" will use integral
+//   keys to combine blocks.
 template<class RandIt, class Compare>
 void adaptive_merge_impl
   ( RandIt first
@@ -2144,134 +2401,43 @@ void adaptive_merge_impl
      //Calculate ideal parameters and try to collect needed unique keys
      size_type l_block = size_type(ceil_sqrt(len));

+      //One range is not big enough to extract keys and the internal buffer so a
+      //rotation-based merge will do just fine
      if(len1 <= l_block*2 || len2 <= l_block*2){
         merge_bufferless(first, first+len1, first+len1+len2, comp);
         return;
      }

-      size_type l_intbuf = xbuf.capacity() >= l_block ? 0u : l_block;
-
-      //This is the minimum number of case to implement the ideal algorithm
-      //ceil(len/l_block) - 1 (as the first block is used as buffer)
-      size_type n_keys = l_block;
-      while(n_keys >= (len-l_intbuf-n_keys)/l_block){
-         --n_keys;
-      }
-      ++n_keys;
-      BOOST_ASSERT(n_keys < l_block);
-
-      if(xbuf.template supports_aligned_trailing<size_type>(l_block, n_keys)){
-         n_keys = 0u;
-      }
-
+      //Determine the number of keys and internal buffer. If xbuf has enough memory, no
+      //internal buffer is needed so l_intbuf will remain 0.
+      size_type l_intbuf = 0;
+      size_type n_keys = adaptive_merge_n_keys_intbuf(l_block, len, xbuf, l_intbuf);
      size_type const to_collect = l_intbuf+n_keys;
-	   size_type const collected = collect_unique(first, first+len1, to_collect, comp, xbuf);
-
+      //Try to extract needed unique values from the first range
+      size_type const collected  = collect_unique(first, first+len1, to_collect, comp, xbuf);
      BOOST_MOVE_ADAPTIVE_SORT_PRINT("\n   A collect: ", len);
+
+      //The minimum number of keys is not available on the first range, so fall back to rotations
      if(collected != to_collect && collected < 4){
         merge_bufferless(first, first+len1, first+len1+len2, comp);
+         return;
      }
-      else{
-         bool use_internal_buf = true;
-         if (collected != to_collect){
-            l_intbuf = 0u;
-            n_keys = collected;
-            use_internal_buf = false;
-            l_block  = lblock_for_combine(l_intbuf, n_keys, len, use_internal_buf);
-            l_intbuf = use_internal_buf ? l_block : 0u;
-         }

-         bool xbuf_used = collected == to_collect && xbuf.capacity() >= l_block;
-         size_type const l_combine  = len-collected;
-         size_type const l_combine1 = len1-collected;
-
-         size_type n_bef_irreg2, n_aft_irreg2, l_irreg1, l_irreg2, midkey_idx;
-         if(n_keys){
-            RandIt const first_data = first+collected;
-            RandIt const keys = first;
-            combine_params( keys, comp, first_data, l_combine
-                          , l_combine1, l_block, xbuf, comp
-                          , midkey_idx, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, true);   //Outputs
-            BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A combine: ", len);
-            if(xbuf_used){
-               merge_blocks_with_buf
-                  (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf, xbuf_used);
-               BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg xbf: ", len);
-            }
-            else if(use_internal_buf){
-               #define BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
-               #ifdef BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
-               range_xbuf<RandIt, swap_op> rbuf(first_data-l_block, first_data);
-               merge_blocks_with_buf
-                  (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, rbuf, xbuf_used);
-               BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg buf: ", len);
-               #else
-               merge_blocks_left
-                  (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf_used);
-               BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg lft: ", len);
-               #endif
-            }
-            else{
-               merge_blocks_bufferless
-                  (keys, keys[midkey_idx], comp, first_data, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp);
-               BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg bfl: ", len);
-            }
-         }
-         else{
-            xbuf.clear();
-            size_type *const uint_keys = xbuf.template aligned_trailing<size_type>(l_block);
-            combine_params( uint_keys, less(), first, l_combine
-                          , l_combine1, l_block, xbuf, comp
-                          , midkey_idx, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, true);   //Outputs
-            BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A combine: ", len);
-            merge_blocks_with_buf
-               (uint_keys, uint_keys[midkey_idx], less(), first, l_block, l_irreg1, n_bef_irreg2, n_aft_irreg2, l_irreg2, comp, xbuf, true);
-            xbuf.clear();
-            BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A mrg lft: ", len);
-         }
-
-         n_keys = collected-l_intbuf;
-         if(use_internal_buf){
-            if(xbuf_used){
-               xbuf.clear();
-               //Nothing to do
-               if(n_keys){
-                  stable_sort(first, first+n_keys, comp, xbuf);
-                  stable_merge(first, first+n_keys, first+len, comp, xbuf);
-               }
-            }
-            else{
-               #ifdef BOOST_MOVE_ADAPTIVE_MERGE_WITH_BUF
-               xbuf.clear();
-               stable_sort(first, first+collected, comp, xbuf);
-               stable_merge(first, first+collected, first+len, comp, xbuf);
-               #else
-               xbuf.clear();
-               stable_sort(first+len-l_block, first+len, comp, xbuf);
-               RandIt const pos1 = lower_bound(first+n_keys, first+len-l_block, first[len-1], comp);
-               RandIt const pos2 = rotate_gcd(pos1, first+len-l_block, first+len);
-               stable_merge(first+n_keys, pos1, pos2, antistable<Compare>(comp), xbuf);
-               if(n_keys){
-                  stable_sort(first, first+n_keys, comp, xbuf);
-                  stable_merge(first, first+n_keys, first+len, comp, xbuf);
-               }
-               #endif
-            }
-
-            BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A buf mrg: ", len);
-         }
-         else{
-            stable_sort(first, first+collected, comp, xbuf);
-            xbuf.clear();
-            if(xbuf.capacity() >= collected){
-               buffered_merge(first, first+collected, first+len1+len2, comp, xbuf);
-            }
-            else{
-               merge_bufferless(first, first+collected, first+len1+len2, comp);
-            }
-         }
-         BOOST_MOVE_ADAPTIVE_SORT_PRINT("   A key mrg: ", len);
+      //If not enough keys but more than minimum, adjust the internal buffer and key count
+      bool use_internal_buf = collected == to_collect;
+      if (!use_internal_buf){
+         l_intbuf = 0u;
+         n_keys = collected;
+         l_block  = lblock_for_combine(l_intbuf, n_keys, len, use_internal_buf);
+         //If use_internal_buf is false, then the internal buffer will be zero and rotation-based combination will be used
+         l_intbuf = use_internal_buf ? l_block : 0u;
      }
+
+      bool const xbuf_used = collected == to_collect && xbuf.capacity() >= l_block;
+      //Merge trailing elements using smart merges
+      adaptive_merge_combine_blocks(first, len1, len2, collected,   n_keys, l_block, use_internal_buf, xbuf_used, comp, xbuf);
+      //Merge buffer and keys with the rest of the values
+      adaptive_merge_final_merge   (first, len1, len2, collected, l_intbuf, l_block, use_internal_buf, xbuf_used, comp, xbuf);
   }
 }