From f0d0a1ebd76716246a384a55d10323f00f88b409 Mon Sep 17 00:00:00 2001
From: Victor Zverovich <victor.zverovich@gmail.com>
Date: Sat, 25 Aug 2018 16:08:32 -0700
Subject: [PATCH] Implement Grisu2 digit generation

---
 CMakeLists.txt           |  6 +--
 include/fmt/format-inl.h | 79 +++++++++++++++++++++++++++++++++++++++-
 include/fmt/format.h     | 43 +++++++---------------
 3 files changed, 95 insertions(+), 33 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65d5f2e6..c25be5f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
       -Wcast-qual -Wformat=2 -Wmissing-include-dirs
       -Wcast-align -Wnon-virtual-dtor
       -Wctor-dtor-privacy -Wdisabled-optimization
-      -Winvalid-pch -Wmissing-declarations -Woverloaded-virtual
+      -Winvalid-pch -Woverloaded-virtual
       -Wno-ctor-dtor-privacy -Wno-dangling-else -Wno-float-equal
       -Wno-format-nonliteral -Wno-sign-conversion -Wno-shadow)
   if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.6)
@@ -101,8 +101,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
       -Wno-unused-member-function
       -Wno-format-nonliteral -Wno-missing-noreturn -Wno-undefined-func-template
       -Wno-shadow -Wno-sign-conversion -Wno-used-but-marked-unused
-      -Wno-covered-switch-default -Wno-missing-variable-declarations
-      -Wno-double-promotion)
+      -Wno-covered-switch-default -Wno-missing-prototypes
+      -Wno-missing-variable-declarations -Wno-double-promotion)
 
   set(WERROR_FLAG -Werror)
 
diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 49779a0f..98aaee26 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -275,11 +275,16 @@ const char basic_data<T>::DIGITS[] =
 
 template <typename T>
 const uint32_t basic_data<T>::POWERS_OF_10_32[] = {
+  1, FMT_POWERS_OF_10(1)
+};
+
+template <typename T>
+const uint32_t basic_data<T>::ZERO_OR_POWERS_OF_10_32[] = {
   0, FMT_POWERS_OF_10(1)
 };
 
 template <typename T>
-const uint64_t basic_data<T>::POWERS_OF_10_64[] = {
+const uint64_t basic_data<T>::ZERO_OR_POWERS_OF_10_64[] = {
   0,
   FMT_POWERS_OF_10(1),
   FMT_POWERS_OF_10(1000000000ull),
@@ -361,6 +366,78 @@ FMT_FUNC fp get_cached_power(int min_exponent, int &pow10_exponent) {
   pow10_exponent = first_dec_exp + index * dec_exp_step;
   return fp(data::POW10_SIGNIFICANDS[index], data::POW10_EXPONENTS[index]);
 }
+
+// Generates digits of scaled_upper using the Grisu2 digit-gen algorithm and
+// stops as soon as the remaining value is within delta. Leading zeros are
+// skipped. On return size is the number of chars written to buffer and dec_exp
+// has been adjusted by the decimal exponent of the last generated digit.
+FMT_FUNC void grisu2_gen_digits(
+    const fp &scaled_value, const fp &scaled_upper, uint64_t delta,
+    char *buffer, size_t &size, int &dec_exp) {
+  internal::fp one(1ull << -scaled_upper.e, scaled_upper.e);
+  uint32_t hi = static_cast<uint32_t>(scaled_upper.f >> -one.e);  // p1 in Grisu
+  uint64_t lo = scaled_upper.f & (one.f - 1);                     // p2 in Grisu
+  size = 0;
+  int kappa = static_cast<int>(count_digits(hi));  // Signed: goes below zero.
+  while (kappa > 0) {
+    uint32_t digit = 0;
+    // Constant divisors save one integer division per iteration (miloyip).
+    switch (kappa) {
+    case 10: digit = hi / 1000000000; hi %= 1000000000; break;
+    case  9: digit = hi /  100000000; hi %=  100000000; break;
+    case  8: digit = hi /   10000000; hi %=   10000000; break;
+    case  7: digit = hi /    1000000; hi %=    1000000; break;
+    case  6: digit = hi /     100000; hi %=     100000; break;
+    case  5: digit = hi /      10000; hi %=      10000; break;
+    case  4: digit = hi /       1000; hi %=       1000; break;
+    case  3: digit = hi /        100; hi %=        100; break;
+    case  2: digit = hi /         10; hi %=         10; break;
+    case  1: digit = hi;              hi =           0; break;
+    default: FMT_ASSERT(false, "invalid number of digits");
+    }
+    if (digit != 0 || size != 0)
+      buffer[size++] = static_cast<char>('0' + digit);
+    --kappa;
+    uint64_t remainder = (static_cast<uint64_t>(hi) << -one.e) + lo;
+    if (remainder <= delta) {
+      dec_exp += kappa;
+      (void)scaled_value;  // TODO: use scaled_value
+      return;
+    }
+  }
+  for (;;) {
+    lo *= 10;
+    delta *= 10;
+    char digit = static_cast<char>(lo >> -one.e);
+    if (digit != 0 || size != 0)
+      buffer[size++] = static_cast<char>('0' + digit);
+    lo &= one.f - 1;
+    --kappa;
+    if (lo < delta) {
+      dec_exp += kappa;
+      return;
+    }
+  }
+}
+
+// Writes the Grisu2 digits of value to buffer; dec_exp is not emitted yet.
+FMT_FUNC void grisu2_format(double value, char *buffer, size_t &size) {
+  fp binary(value), lower, upper;
+  binary.compute_boundaries(lower, upper);  // Done before normalization.
+  binary.normalize();
+  int dec_exp = 0;  // K in Grisu paper.
+  const int min_exp = -60;
+  // Look up a cached power of 10 close to 1 / upper.
+  const fp cached = get_cached_power(
+      min_exp - (upper.e + fp::significand_size), dec_exp);
+  fp scaled_value = binary * cached;
+  fp scaled_lo = lower * cached;
+  fp scaled_hi = upper * cached;
+  ++scaled_lo.f;  // +1 ulp
+  --scaled_hi.f;  // -1 ulp
+  const uint64_t delta = scaled_hi.f - scaled_lo.f;
+  grisu2_gen_digits(scaled_value, scaled_hi, delta, buffer, size, dec_exp);
+}
 }  // namespace internal
 
 #if FMT_USE_WINDOWS_H
diff --git a/include/fmt/format.h b/include/fmt/format.h
index bd559a52..f4c0f6ee 100644
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@@ -365,6 +365,10 @@ FMT_API fp operator*(fp x, fp y);
 // (binary) exponent satisfies min_exponent <= c_k.e <= min_exponent + 3.
 FMT_API fp get_cached_power(int min_exponent, int &pow10_exponent);
 
+// Formats value using the Grisu2 algorithm:
+// https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf
+FMT_API void grisu2_format(double value, char *buffer, size_t &size);
+
 template <typename Allocator>
 typename Allocator::value_type *allocate(Allocator& alloc, std::size_t n) {
 #if __cplusplus >= 201103L || FMT_MSC_VER >= 1700
@@ -952,7 +956,8 @@ struct int_traits {
 template <typename T = void>
 struct FMT_API basic_data {
   static const uint32_t POWERS_OF_10_32[];
-  static const uint64_t POWERS_OF_10_64[];
+  static const uint32_t ZERO_OR_POWERS_OF_10_32[];
+  static const uint64_t ZERO_OR_POWERS_OF_10_64[];
   static const uint64_t POW10_SIGNIFICANDS[];
   static const int16_t POW10_EXPONENTS[];
   static const char DIGITS[];
@@ -973,7 +978,7 @@ inline unsigned count_digits(uint64_t n) {
   // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
   // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
   int t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;
-  return to_unsigned(t) - (n < data::POWERS_OF_10_64[t]) + 1;
+  return to_unsigned(t) - (n < data::ZERO_OR_POWERS_OF_10_64[t]) + 1;
 }
 #else
 // Fallback version of count_digits used when __builtin_clz is not available.
@@ -1043,7 +1048,8 @@ class decimal_formatter {
       // https://github.com/jeaiii/itoa
       unsigned n = N - 1;
       unsigned a = n / 5 * n * 53 / 16;
-      uint64_t t = ((1ULL << (32 + a)) / data::POWERS_OF_10_32[n] + 1 - n / 9);
+      uint64_t t = ((1ULL << (32 + a)) /
+                   data::ZERO_OR_POWERS_OF_10_32[n] + 1 - n / 9);
       t = ((t * u) >> a) + n / 5 * 4;
       write_pair(0, t >> 32);
       for (unsigned i = 2; i < N; i += 2) {
@@ -1075,7 +1081,7 @@ class decimal_formatter_null : public decimal_formatter {
 // Optional version of count_digits for better performance on 32-bit platforms.
 inline unsigned count_digits(uint32_t n) {
   int t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;
-  return to_unsigned(t) - (n < data::POWERS_OF_10_32[t]) + 1;
+  return to_unsigned(t) - (n < data::ZERO_OR_POWERS_OF_10_32[t]) + 1;
 }
 #endif
 
@@ -2943,31 +2949,10 @@ void basic_writer<Range>::write_double(T value, const format_specs &spec) {
   basic_memory_buffer<char_type> buffer;
   if (internal::const_check(FMT_USE_GRISU && sizeof(T) <= sizeof(double) &&
       std::numeric_limits<double>::is_iec559)) {
-    internal::fp fp_value(static_cast<double>(value));
-    fp_value.normalize();
-    // Find a cached power of 10 close to 1 / fp_value.
-    int dec_exp = 0;
-    const int min_exp = -60;
-    auto dec_pow = internal::get_cached_power(
-        min_exp - (fp_value.e + internal::fp::significand_size), dec_exp);
-    internal::fp product = fp_value * dec_pow;
-    // Generate output using Grisu digit-gen-mix algorithm.
-    internal::fp one(1ull << -product.e, product.e);
-    uint64_t hi = product.f >> -one.e;
-    uint64_t f = product.f & (one.f - 1);
-    typedef back_insert_range<internal::basic_buffer<char_type>> range;
-    basic_writer<range> w{range(buffer)};
-    w.write(hi);
-    size_t digits = buffer.size();
-    w.write('.');
-    const unsigned max_digits = 18;
-    while (digits++ < max_digits) {
-      f *= 10;
-      w.write(static_cast<char>('0' + (f >> -one.e)));
-      f &= one.f - 1;
-    }
-    w.write('e');
-    w.write(-dec_exp);
+    char buf[100]; // TODO: max size
+    size_t size = 0;
+    internal::grisu2_format(static_cast<double>(value), buf, size);
+    buffer.append(buf, buf + size); // TODO: avoid extra copy
   } else {
     format_specs normalized_spec(spec);
     normalized_spec.type_ = handler.type;