From 5ad8906e91efd76cae8365ec7de34434b67a3ca2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kr=C3=BCgler?= <daniel.kruegler@gmail.com>
Date: Sun, 8 Aug 2021 18:29:12 +0200
Subject: [PATCH] #148: icu_regex_traits::translate_nocase doesn't use
 case-folding In translate_nocase replace u_tolower by u_foldCase(c,
 U_FOLD_CASE_DEFAULT) suitable for single codeunit case folding

---
 include/boost/regex/v4/icu.hpp         |   2 +-
 include/boost/regex/v5/icu.hpp         |   2 +-
 test/Jamfile.v2                        |   3 +
 test/unicode/unicode_casefold_test.cpp | 204 +++++++++++++++++++++++++
 4 files changed, 209 insertions(+), 2 deletions(-)
 create mode 100644 test/unicode/unicode_casefold_test.cpp
diff --git a/include/boost/regex/v4/icu.hpp b/include/boost/regex/v4/icu.hpp
index 9724b0f8..7e70f57e 100644
--- a/include/boost/regex/v4/icu.hpp
+++ b/include/boost/regex/v4/icu.hpp
@@ -187,7 +187,7 @@ namespace boost {
       }
       char_type translate_nocase(char_type c) const
       {
-         return ::u_tolower(c);
+         return ::u_foldCase(c, U_FOLD_CASE_DEFAULT);
       }
       char_type translate(char_type c, bool icase) const
       {
diff --git a/include/boost/regex/v5/icu.hpp b/include/boost/regex/v5/icu.hpp
index a9264965..f172553d 100644
--- a/include/boost/regex/v5/icu.hpp
+++ b/include/boost/regex/v5/icu.hpp
@@ -161,7 +161,7 @@ public:
    }
    char_type translate_nocase(char_type c) const
    {
-      return ::u_tolower(c);
+      return ::u_foldCase(c, U_FOLD_CASE_DEFAULT);
    }
    char_type translate(char_type c, bool icase) const
    {
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 9cef46b7..9a50918c 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -122,6 +122,9 @@ test-suite regex
       [ run unicode/unicode_iterator_test.cpp : : : 
             [ check-target-builds ../build//is_legacy_03 : : <source>../build//boost_regex ]
             release <define>TEST_UTF16 : unicode_iterator_test_utf16 ]
+      [ run unicode/unicode_casefold_test.cpp 
+            ../build//boost_regex ../build//icu_options      
+      ]
       [ run static_mutex/static_mutex_test.cpp
             ../../thread/build//boost_thread ../build//boost_regex
       ]
diff --git a/test/unicode/unicode_casefold_test.cpp b/test/unicode/unicode_casefold_test.cpp
new file mode 100644
index 00000000..d1597c81
--- /dev/null
+++ b/test/unicode/unicode_casefold_test.cpp
@@ -0,0 +1,204 @@
+/*
+ *
+ * Copyright (c) 2021 John Maddock
+ * Copyright (c) 2021 Daniel Kruegler
+ *
+ * Use, modification and distribution are subject to the 
+ * Boost Software License, Version 1.0. (See accompanying file 
+ * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+ *
+ */
+ 
+ /*
+  *   LOCATION:    see http://www.boost.org for most recent version.
+  *   FILE         unicode_casefold_test.cpp
+  *   VERSION      see <boost/version.hpp>
+  *   DESCRIPTION: Simple test suite for Unicode case folding.
+  */
+
+#include <boost/regex/config.hpp>
+#include <boost/detail/lightweight_main.hpp>
+#include "../test_macros.hpp"
+
+#if defined(BOOST_HAS_ICU)
+
+#include <boost/regex/icu.hpp>
+
+#include <utility>
+
+#include <unicode/uversion.h>
+#include <unicode/uchar.h>
+
+typedef std::pair<int, int> unicode_verinfo; // (major, minor) Unicode version
+
+// Returns the (major, minor) Unicode version reported by ICU's
+// u_getUnicodeVersion(), because some spot test cases can only
+// be tested for specific Unicode versions.
+unicode_verinfo get_unicode_version()
+{
+  UVersionInfo versionArray = {}; // zero-initialized before ICU fills it in
+  u_getUnicodeVersion(versionArray);
+  unicode_verinfo result(versionArray[0] , versionArray[1]); // keep only the first two components
+  return result;
+}
+
+void latin_1_checks() // Checks translate_nocase() for every code point in [U+0000, U+00FF].
+{
+  typedef boost::icu_regex_traits traits_type;
+  traits_type traits;
+
+  // Test range [U+0000, U+0041): Identity fold
+  for (traits_type::char_type c = 0x0; c < 0x41; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+
+  // Test ASCII upper case letters [A, Z]: Each character folds
+  // to its lowercase variant:
+  for (traits_type::char_type c = 0x41; c <= 0x5A; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    const int shift = 0x61 - 0x41; // distance from 'A' to 'a'
+    BOOST_CHECK_EQUAL(nc, c + shift);
+    BOOST_CHECK_EQUAL(nc, traits.tolower(c));
+  }
+
+  // Test range (U+005A, U+00B5): Identity fold
+  for (traits_type::char_type c = 0x5A + 1; c < 0xB5; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+
+  // U+00B5 (MICRO SIGN) folds to its compatibility decomposition
+  // GREEK SMALL LETTER MU (U+03BC):
+  {
+    traits_type::char_type c = 0xB5;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, 0x03BC);
+  }
+
+  // Test range (U+00B5, U+00BF]: Identity fold
+  for (traits_type::char_type c = 0xB5 + 1; c <= 0xBF; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+
+  // Test range [U+00C0, U+00D6]: Each character folds
+  // to its lowercase variant:
+  for (traits_type::char_type c = 0xC0; c <= 0xD6; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    traits_type::char_type lc = traits.tolower(c);
+    BOOST_CHECK_EQUAL(nc, lc);
+    BOOST_CHECK_NE(nc, c); // the fold must actually change the character
+  }
+
+  // U+00D7: Identity fold
+  {
+    traits_type::char_type c = 0xD7;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+
+  // Test range [U+00D8, U+00DE]: Each character folds
+  // to its lowercase variant:
+  for (traits_type::char_type c = 0xD8; c <= 0xDE; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    traits_type::char_type lc = traits.tolower(c);
+    BOOST_CHECK_EQUAL(nc, lc);
+    BOOST_CHECK_NE(nc, c); // the fold must actually change the character
+  }
+
+  // Test range [U+00DF, U+00FF]: Identity fold
+  // Note that U+00DF (LATIN SMALL LETTER SHARP S)
+  // does not fold to U+1E9E (LATIN CAPITAL LETTER
+  // SHARP S) due to the case folding stability
+  // contract; it folds to itself.
+  for (traits_type::char_type c = 0xDF; c <= 0xFF; ++c)
+  {
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+}
+
+void spot_checks()
+{
+  // Test specific values taken straight from the Unicode standard
+  // to verify that our case folding is the same as theirs:
+  typedef boost::icu_regex_traits traits_type;
+  traits_type traits;
+
+  const unicode_verinfo unicode_version = get_unicode_version();
+
+  // 'LATIN CAPITAL LETTER SHARP S' (U+1E9E, added in Unicode 5.1)
+  // folds to 'LATIN SMALL LETTER SHARP S' (U+00DF)
+  if (unicode_version >= unicode_verinfo(5, 1))
+  {
+    traits_type::char_type c = 0x1E9E;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    traits_type::char_type lc = traits.tolower(c);
+    BOOST_CHECK_EQUAL(nc, lc);
+    BOOST_CHECK_EQUAL(nc, 0xDF);
+  }
+
+  // Capital sigma (U+03A3) is the uppercase form of both the regular (U+03C3)
+  // and final (U+03C2) lowercase sigma. All these characters have existed since
+  // Unicode 1.1.0. Both U+03A3 and U+03C2 fold to the regular sigma U+03C3.
+  {
+    traits_type::char_type c = 0x03A3;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    traits_type::char_type lc = traits.tolower(c);
+    BOOST_CHECK_EQUAL(nc, lc);
+    BOOST_CHECK_EQUAL(nc, 0x03C3);
+    c = 0x03C2;
+    nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, 0x03C3);
+    c = 0x03C3;
+    nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+  }
+
+  // In Turkic languages the lowercase letter 'i' (U+0069) maps to an
+  // uppercase dotted I (U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE),
+  // while the uppercase letter 'I' (U+0049) maps to the dotless lowercase
+  // i (U+0131). The Unicode simple default mapping folds U+0130 to itself,
+  // but folds U+0049 to U+0069.
+  {
+    traits_type::char_type c = 0x0130;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    BOOST_CHECK_EQUAL(nc, c);
+    c = 0x0049;
+    nc = traits.translate_nocase(c);
+    traits_type::char_type lc = traits.tolower(c);
+    BOOST_CHECK_EQUAL(nc, lc);
+    BOOST_CHECK_EQUAL(nc, 0x0069);
+  }
+
+  // Cherokee small letters were added with Unicode 8.0,
+  // but the upper case letters existed before, therefore
+  // the small letters case fold to upper case letters.
+  if (unicode_version >= unicode_verinfo(8, 0))
+  {
+    traits_type::char_type c = 0x13F8;
+    traits_type::char_type nc = traits.translate_nocase(c);
+    traits_type::char_type uc = traits.toupper(c);
+    BOOST_CHECK_EQUAL(nc, uc);
+    BOOST_CHECK_EQUAL(nc, 0x13F0);
+  }
+
+}
+
+#endif
+
+int cpp_main( int, char* [] ) // Test driver: runs the case folding checks when ICU is available.
+{
+#if defined(BOOST_HAS_ICU)
+  latin_1_checks();
+  spot_checks();
+#endif // BOOST_HAS_ICU -- without ICU no checks are executed
+  return boost::report_errors();
+}