From 4546ffba1d8af0c97072456779a5f0d44067a43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ion=20Gazta=C3=B1aga?= Date: Sun, 31 Jul 2016 14:11:49 +0200 Subject: [PATCH] As suggested in Trac #12229 ("intrusive::unordered_set::rehash() broken") a new "full rehash" function is added to force rehashing of existing elements when keys are changed but the new keys respect some hash and equality invariants. --- doc/intrusive.qbk | 1 + include/boost/intrusive/hashtable.hpp | 223 ++++++++++++++-------- include/boost/intrusive/unordered_set.hpp | 6 + test/unordered_test.hpp | 21 +- 4 files changed, 165 insertions(+), 86 deletions(-) diff --git a/doc/intrusive.qbk b/doc/intrusive.qbk index 2eecd34..e5498f0 100644 --- a/doc/intrusive.qbk +++ b/doc/intrusive.qbk @@ -3841,6 +3841,7 @@ to be inserted in intrusive containers are allocated using `std::vector` or `std * [@https://svn.boost.org/trac/boost/ticket/11994 Boost Trac #11994: ['Support intrusive container key extractors that return the key by value]] * [@https://svn.boost.org/trac/boost/ticket/12184 Boost Trac #12184: ['clang -Wdocumentation warning]] * [@https://svn.boost.org/trac/boost/ticket/12190 Boost Trac #12190: ['Intrusive List + Flat Map combination crashes]] + * [@https://svn.boost.org/trac/boost/ticket/12229 Boost Trac #12229: ['intrusive::unordered_set::rehash() broken]] * [@https://svn.boost.org/trac/boost/ticket/12245 Boost Trac #12245: ['bstree uses a shared static size_traits for constant_time_size]] [endsect] diff --git a/include/boost/intrusive/hashtable.hpp b/include/boost/intrusive/hashtable.hpp index 4a8e9a8..146a497 100644 --- a/include/boost/intrusive/hashtable.hpp +++ b/include/boost/intrusive/hashtable.hpp @@ -2835,100 +2835,55 @@ class hashtable_impl //! new_bucket_traits.bucket_count() can be bigger or smaller than this->bucket_count(). //! 'new_bucket_traits' copy constructor should not throw. //! - //! Effects: Updates the internal reference with the new bucket, erases - //! the values from the old bucket and inserts then in the new one. + //! Effects: + //! If `new_bucket_traits.bucket_begin() == this->bucket_pointer()` is false, + //! unlinks values from the old bucket and inserts them in the new one according + //! to their hash values. + //! + //! If `new_bucket_traits.bucket_begin() == this->bucket_pointer()` is true, + //! the implementation avoids moving values as much as possible. + //! //! Bucket traits hold by *this is assigned from new_bucket_traits. //! If the container is configured as incremental<>, the split bucket is set //! to the new bucket_count(). //! //! If store_hash option is true, this method does not use the hash function. + //! If false, the implementation tries to minimize calls to the hash function + //! (e.g. once for equivalent values if optimize_multikey is true). + //! + //! If the rehash is successful, the internal bucket_traits is updated with new_bucket_traits. //! //! Complexity: Average case linear in this->size(), worst case quadratic. //! //! Throws: If the hasher functor throws. Basic guarantee.
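A minimal usage sketch of the rehash(const bucket_traits &) overload documented above (illustrative only, not part of the patch): it assumes the default base hook and boost::hash, and every type and variable name below is hypothetical.

#include <boost/intrusive/unordered_set.hpp>
#include <boost/functional/hash.hpp>
#include <cstddef>
#include <cassert>

//Hypothetical value type using the default unordered_set hook
struct MyItem : public boost::intrusive::unordered_set_base_hook<>
{
   int key_;
   explicit MyItem(int k) : key_(k) {}
   friend bool operator==(const MyItem &a, const MyItem &b)
   {  return a.key_ == b.key_;  }
   friend std::size_t hash_value(const MyItem &v)
   {  return boost::hash<int>()(v.key_);  }
};

typedef boost::intrusive::unordered_set<MyItem> Set;

int main()
{
   //Bucket arrays are owned by the user in Boost.Intrusive
   Set::bucket_type small_buckets[4];
   Set::bucket_type big_buckets[64];

   Set s(Set::bucket_traits(small_buckets, 4));

   MyItem items[3] = { MyItem(1), MyItem(2), MyItem(3) };
   for (std::size_t i = 0; i != 3; ++i)
      s.insert(items[i]);

   //Point the container at the bigger array: values are unlinked from the
   //old buckets and relinked into the new ones according to their hash
   //values; no element is copied or reallocated.
   s.rehash(Set::bucket_traits(big_buckets, 64));
   assert(s.bucket_count() == 64 && s.size() == 3);

   s.clear();   //unlink values before they are destroyed (safe-mode hooks)
   return 0;
}

Passing a bucket_traits with fewer buckets shrinks the table in the same way; when the old bucket count is a multiple of the new one, the fast_shrink path of rehash_impl introduced below splices whole buckets without recomputing any hash.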
void rehash(const bucket_traits &new_bucket_traits) { - const bucket_ptr new_buckets = new_bucket_traits.bucket_begin(); - size_type new_bucket_count = new_bucket_traits.bucket_count(); - const bucket_ptr old_buckets = this->priv_bucket_pointer(); - size_type old_bucket_count = this->priv_bucket_count(); + this->rehash_impl(new_bucket_traits, false); + } - //Check power of two bucket array if the option is activated - BOOST_INTRUSIVE_INVARIANT_ASSERT - (!power_2_buckets || (0 == (new_bucket_count & (new_bucket_count-1u)))); - - size_type n = this->priv_get_cache_bucket_num(); - const bool same_buffer = old_buckets == new_buckets; - //If the new bucket length is a common factor - //of the old one we can avoid hash calculations. - const bool fast_shrink = (!incremental) && (old_bucket_count >= new_bucket_count) && - (power_2_buckets || (old_bucket_count % new_bucket_count) == 0); - //If we are shrinking the same bucket array and it's - //is a fast shrink, just rehash the last nodes - size_type new_first_bucket_num = new_bucket_count; - if(same_buffer && fast_shrink && (n < new_bucket_count)){ - new_first_bucket_num = n; - n = new_bucket_count; - } - - //Anti-exception stuff: they destroy the elements if something goes wrong. - //If the source and destination buckets are the same, the second rollback function - //is harmless, because all elements have been already unlinked and destroyed - typedef detail::init_disposer NodeDisposer; - typedef detail::exception_array_disposer ArrayDisposer; - NodeDisposer node_disp; - ArrayDisposer rollback1(new_buckets[0], node_disp, new_bucket_count); - ArrayDisposer rollback2(old_buckets[0], node_disp, old_bucket_count); - - //Put size in a safe value for rollback exception - size_type const size_backup = this->priv_size_traits().get_size(); - this->priv_size_traits().set_size(0); - //Put cache to safe position - this->priv_initialize_cache(); - this->priv_insertion_update_cache(size_type(0u)); - - //Iterate through nodes - for(; n < old_bucket_count; ++n){ - bucket_type &old_bucket = old_buckets[n]; - if(!fast_shrink){ - for( siterator before_i(old_bucket.before_begin()), i(old_bucket.begin()), end_sit(old_bucket.end()) - ; i != end_sit - ; i = before_i, ++i){ - const value_type &v = this->priv_value_from_slist_node(i.pointed_node()); - const std::size_t hash_value = this->priv_stored_or_compute_hash(v, store_hash_t()); - const size_type new_n = detail::hash_to_bucket_split - (hash_value, new_bucket_count, new_bucket_count); - if(cache_begin && new_n < new_first_bucket_num) - new_first_bucket_num = new_n; - siterator const last = (priv_last_in_group)(i); - if(same_buffer && new_n == n){ - before_i = last; - } - else{ - bucket_type &new_b = new_buckets[new_n]; - new_b.splice_after(new_b.before_begin(), old_bucket, before_i, last); - } - } - } - else{ - const size_type new_n = detail::hash_to_bucket_split(n, new_bucket_count, new_bucket_count); - if(cache_begin && new_n < new_first_bucket_num) - new_first_bucket_num = new_n; - bucket_type &new_b = new_buckets[new_n]; - new_b.splice_after( new_b.before_begin() - , old_bucket - , old_bucket.before_begin() - , bucket_plus_vtraits_t::priv_get_last(old_bucket, optimize_multikey_t())); - } - } - - this->priv_size_traits().set_size(size_backup); - this->priv_split_traits().set_size(new_bucket_count); - this->priv_bucket_traits() = new_bucket_traits; - this->priv_initialize_cache(); - this->priv_insertion_update_cache(new_first_bucket_num); - rollback1.release(); - rollback2.release(); + //! 
Note: This function is used when keys of inserted elements are changed + //! (e.g. a language change when the key is a string) but uniqueness and hash properties are + //! preserved, so a fast full rehash recovers invariants for *this without extracting and + //! reinserting all elements again. + //! + //! Requires: Calls to the hash function shall not alter the value uniqueness + //! properties of already inserted elements. If hasher(key1) == hasher(key2) was true when + //! elements were inserted, it shall also be true during the execution of this function. + //! + //! key_equal is not called inside this function, so it is assumed that key_equal(value1, value2) + //! produces the same results as before for inserted elements. + //! + //! Effects: Reprocesses all values held by *this, recalculating their hash values + //! and redistributing them through the buckets. + //! + //! If store_hash option is true, this method uses the hash function and updates the stored hash value. + //! + //! Complexity: Average case linear in this->size(), worst case quadratic. + //! + //! Throws: If the hasher functor throws. Basic guarantee. + void full_rehash() + { + this->rehash_impl(this->priv_bucket_traits(), true); } //! Requires: @@ -3113,6 +3068,110 @@ class hashtable_impl void check() const {} private: + void rehash_impl(const bucket_traits &new_bucket_traits, bool do_full_rehash) + { + const bucket_ptr new_buckets = new_bucket_traits.bucket_begin(); + size_type new_bucket_count = new_bucket_traits.bucket_count(); + const bucket_ptr old_buckets = this->priv_bucket_pointer(); + size_type old_bucket_count = this->priv_bucket_count(); + + //Check power of two bucket array if the option is activated + BOOST_INTRUSIVE_INVARIANT_ASSERT + (!power_2_buckets || (0 == (new_bucket_count & (new_bucket_count-1u)))); + + size_type n = this->priv_get_cache_bucket_num(); + const bool same_buffer = old_buckets == new_buckets; + //If the new bucket length is a common factor + //of the old one we can avoid hash calculations. + const bool fast_shrink = (!do_full_rehash) && (!incremental) && (old_bucket_count >= new_bucket_count) && + (power_2_buckets || (old_bucket_count % new_bucket_count) == 0); + //If we are shrinking the same bucket array and it + //is a fast shrink, just rehash the last nodes + size_type new_first_bucket_num = new_bucket_count; + if(same_buffer && fast_shrink && (n < new_bucket_count)){ + new_first_bucket_num = n; + n = new_bucket_count; + } + + //Anti-exception stuff: they destroy the elements if something goes wrong.
+ //If the source and destination buckets are the same, the second rollback function + //is harmless, because all elements have been already unlinked and destroyed + typedef detail::init_disposer NodeDisposer; + typedef detail::exception_array_disposer ArrayDisposer; + NodeDisposer node_disp; + ArrayDisposer rollback1(new_buckets[0], node_disp, new_bucket_count); + ArrayDisposer rollback2(old_buckets[0], node_disp, old_bucket_count); + + //Put size in a safe value for rollback exception + size_type const size_backup = this->priv_size_traits().get_size(); + this->priv_size_traits().set_size(0); + //Put cache to safe position + this->priv_initialize_cache(); + this->priv_insertion_update_cache(size_type(0u)); + + //Iterate through nodes + for(; n < old_bucket_count; ++n){ + bucket_type &old_bucket = old_buckets[n]; + if(!fast_shrink){ + for( siterator before_i(old_bucket.before_begin()), i(old_bucket.begin()), end_sit(old_bucket.end()) + ; i != end_sit + ; i = before_i, ++i){ + + //First obtain hash value (and store it if do_full_rehash) + std::size_t hash_value; + if(do_full_rehash){ + value_type &v = this->priv_value_from_slist_node(i.pointed_node()); + hash_value = this->priv_hasher()(key_of_value()(v)); + node_functions_t::store_hash(pointer_traits::pointer_to(this->priv_value_to_node(v)), hash_value, store_hash_t()); + } + else{ + const value_type &v = this->priv_value_from_slist_node(i.pointed_node()); + hash_value = this->priv_stored_or_compute_hash(v, store_hash_t()); + } + + //Now calculate the new bucket position + const size_type new_n = detail::hash_to_bucket_split + (hash_value, new_bucket_count, new_bucket_count); + + //Update first used bucket cache + if(cache_begin && new_n < new_first_bucket_num) + new_first_bucket_num = new_n; + + //If the target bucket is new, transfer the whole group + siterator const last = (priv_last_in_group)(i); + + if(same_buffer && new_n == n){ + before_i = last; + } + else{ + bucket_type &new_b = new_buckets[new_n]; + new_b.splice_after(new_b.before_begin(), old_bucket, before_i, last); + } + } + } + else{ + const size_type new_n = detail::hash_to_bucket_split(n, new_bucket_count, new_bucket_count); + if(cache_begin && new_n < new_first_bucket_num) + new_first_bucket_num = new_n; + bucket_type &new_b = new_buckets[new_n]; + new_b.splice_after( new_b.before_begin() + , old_bucket + , old_bucket.before_begin() + , bucket_plus_vtraits_t::priv_get_last(old_bucket, optimize_multikey_t())); + } + } + + this->priv_size_traits().set_size(size_backup); + this->priv_split_traits().set_size(new_bucket_count); + if(&new_bucket_traits != &this->priv_bucket_traits()){ + this->priv_bucket_traits() = new_bucket_traits; + } + this->priv_initialize_cache(); + this->priv_insertion_update_cache(new_first_bucket_num); + rollback1.release(); + rollback2.release(); + } + template void priv_clone_from(MaybeConstHashtableImpl &src, Cloner cloner, Disposer disposer) { diff --git a/include/boost/intrusive/unordered_set.hpp b/include/boost/intrusive/unordered_set.hpp index 1174dff..1968fa2 100644 --- a/include/boost/intrusive/unordered_set.hpp +++ b/include/boost/intrusive/unordered_set.hpp @@ -360,6 +360,9 @@ class unordered_set_impl //! @copydoc ::boost::intrusive::hashtable::rehash(const bucket_traits &) void rehash(const bucket_traits &new_bucket_traits); + //! @copydoc ::boost::intrusive::hashtable::full_rehash + void full_rehash(); + //! 
@copydoc ::boost::intrusive::hashtable::incremental_rehash(bool) bool incremental_rehash(bool grow = true); @@ -836,6 +839,9 @@ class unordered_multiset_impl //! @copydoc ::boost::intrusive::hashtable::rehash(const bucket_traits &) void rehash(const bucket_traits &new_bucket_traits); + //! @copydoc ::boost::intrusive::hashtable::full_rehash + void full_rehash(); + //! @copydoc ::boost::intrusive::hashtable::incremental_rehash(bool) bool incremental_rehash(bool grow = true); diff --git a/test/unordered_test.hpp b/test/unordered_test.hpp index fb586cd..de743a4 100644 --- a/test/unordered_test.hpp +++ b/test/unordered_test.hpp @@ -530,27 +530,32 @@ void test_unordered { int init_values [] = { 1, 2, 2, 3, 4, 5 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } - //Full shrink rehash + //Shrink rehash testset1.rehash(bucket_traits( pointer_traits:: pointer_to(buckets1[0]), 4)); BOOST_TEST (testset1.incremental_rehash() == false); { int init_values [] = { 4, 5, 1, 2, 2, 3 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } - //Full shrink rehash again + + //Shrink rehash again testset1.rehash(bucket_traits( pointer_traits:: pointer_to(buckets1[0]), 2)); BOOST_TEST (testset1.incremental_rehash() == false); { int init_values [] = { 2, 2, 4, 3, 5, 1 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } - //Full growing rehash + + //Growing rehash testset1.rehash(bucket_traits( pointer_traits:: pointer_to(buckets1[0]), BucketSize)); - BOOST_TEST (testset1.incremental_rehash() == false); + + //Full rehash (no visible effects) + testset1.full_rehash(); { int init_values [] = { 1, 2, 2, 3, 4, 5 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } + //Incremental rehash shrinking //First incremental rehashes should lead to the same sequence for(std::size_t split_bucket = testset1.split_count(); split_bucket > 6; --split_bucket){ @@ -559,16 +564,19 @@ void test_unordered { int init_values [] = { 1, 2, 2, 3, 4, 5 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } } + //Incremental rehash step BOOST_TEST (testset1.incremental_rehash(false) == true); BOOST_TEST(testset1.split_count() == (BucketSize/2+1)); { int init_values [] = { 5, 1, 2, 2, 3, 4 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } + //Incremental rehash step 2 BOOST_TEST (testset1.incremental_rehash(false) == true); BOOST_TEST(testset1.split_count() == (BucketSize/2)); { int init_values [] = { 4, 5, 1, 2, 2, 3 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } + //This incremental rehash should fail because we've reached half of the bucket array BOOST_TEST(testset1.incremental_rehash(false) == false); BOOST_TEST(testset1.split_count() == BucketSize/2); @@ -619,6 +627,11 @@ void test_unordered pointer_to(buckets3[0]), BucketSize*2)); { int init_values [] = { 1, 2, 2, 3, 4, 5 }; TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } + + //Full rehash (no visible effects) + testset1.full_rehash(); + { int init_values [] = { 1, 2, 2, 3, 4, 5 }; + TEST_INTRUSIVE_SEQUENCE_MAYBEUNIQUE( init_values, testset1 ); } } //test: find, equal_range (lower_bound, upper_bound):
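To illustrate the full_rehash() scenario described in the new documentation and exercised (trivially, since keys are unchanged) by the tests above, here is a usage sketch, not part of the patch: keys of already inserted elements are modified in place while preserving uniqueness, then a single full_rehash() call restores the container invariants. The default base hook and boost::hash are assumed; all type and variable names are hypothetical.

#include <boost/intrusive/unordered_set.hpp>
#include <boost/functional/hash.hpp>
#include <string>
#include <cstddef>
#include <cassert>

//Hypothetical value type whose key can be changed in place
struct Word : public boost::intrusive::unordered_set_base_hook<>
{
   std::string text_;
   explicit Word(const char *t) : text_(t) {}
   friend bool operator==(const Word &a, const Word &b)
   {  return a.text_ == b.text_;  }
   friend std::size_t hash_value(const Word &w)
   {  return boost::hash<std::string>()(w.text_);  }
};

typedef boost::intrusive::unordered_set<Word> WordSet;

int main()
{
   WordSet::bucket_type buckets[16];
   WordSet words(WordSet::bucket_traits(buckets, 16));

   Word w1("one"), w2("two"), w3("three");
   words.insert(w1); words.insert(w2); words.insert(w3);

   //Keys are changed in place (the "language change" scenario from the patch).
   //Uniqueness is preserved, but the values now sit in buckets that no longer
   //match their hash values, so lookups would be unreliable.
   w1.text_ = "bat"; w2.text_ = "bi"; w3.text_ = "hiru";

   //full_rehash() recomputes every hash (updating the stored hash if the
   //store_hash option is used) and redistributes the values, restoring the
   //container invariants without erasing and reinserting each element.
   words.full_rehash();

   assert(words.find(Word("bi")) != words.end());

   words.clear();   //unlink values before they are destroyed (safe-mode hooks)
   return 0;
}

As the new documentation notes, key_equal is never called during full_rehash(), so only the hash and uniqueness properties of the modified keys must hold; the alternative of extracting and reinserting every element would do strictly more work.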
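A side note on the fast_shrink branch that rehash_impl keeps (and deliberately skips when do_full_rehash is true): when the old bucket count is a multiple of the new one, the destination bucket of an element depends only on the bucket it currently occupies, not on its hash, so whole buckets can be spliced across without invoking the hasher. A minimal sketch of the underlying arithmetic fact, with arbitrary example sizes:

#include <cstddef>
#include <cassert>

int main()
{
   //If old_count % new_count == 0, then for any hash value h:
   //   (h % old_count) % new_count == h % new_count
   //so the destination bucket depends only on the source bucket index.
   const std::size_t old_count = 12, new_count = 4;   //arbitrary example sizes
   for (std::size_t h = 0; h != 10000; ++h)
      assert((h % old_count) % new_count == h % new_count);
   return 0;
}

The same identity holds for the power-of-two masking used when the power_2_buckets option is active, since the low bits selected by the smaller mask are a subset of those selected by the larger one.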