From 5ad8906e91efd76cae8365ec7de34434b67a3ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Kr=C3=BCgler?= Date: Sun, 8 Aug 2021 18:29:12 +0200 Subject: [PATCH] #148: icu_regex_traits::translate_nocase doesn't use case-folding In translate_nocase replace u_tolower by u_foldCase(c, U_FOLD_CASE_DEFAULT) suitable for single codeunit case folding --- include/boost/regex/v4/icu.hpp | 2 +- include/boost/regex/v5/icu.hpp | 2 +- test/Jamfile.v2 | 3 + test/unicode/unicode_casefold_test.cpp | 204 +++++++++++++++++++++++++ 4 files changed, 209 insertions(+), 2 deletions(-) create mode 100644 test/unicode/unicode_casefold_test.cpp diff --git a/include/boost/regex/v4/icu.hpp b/include/boost/regex/v4/icu.hpp index 9724b0f8..7e70f57e 100644 --- a/include/boost/regex/v4/icu.hpp +++ b/include/boost/regex/v4/icu.hpp @@ -187,7 +187,7 @@ namespace boost { } char_type translate_nocase(char_type c) const { - return ::u_tolower(c); + return ::u_foldCase(c, U_FOLD_CASE_DEFAULT); } char_type translate(char_type c, bool icase) const { diff --git a/include/boost/regex/v5/icu.hpp b/include/boost/regex/v5/icu.hpp index a9264965..f172553d 100644 --- a/include/boost/regex/v5/icu.hpp +++ b/include/boost/regex/v5/icu.hpp @@ -161,7 +161,7 @@ public: } char_type translate_nocase(char_type c) const { - return ::u_tolower(c); + return ::u_foldCase(c, U_FOLD_CASE_DEFAULT); } char_type translate(char_type c, bool icase) const { diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 9cef46b7..9a50918c 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -122,6 +122,9 @@ test-suite regex [ run unicode/unicode_iterator_test.cpp : : : [ check-target-builds ../build//is_legacy_03 : : ../build//boost_regex ] release TEST_UTF16 : unicode_iterator_test_utf16 ] + [ run unicode/unicode_casefold_test.cpp + ../build//boost_regex ../build//icu_options + ] [ run static_mutex/static_mutex_test.cpp ../../thread/build//boost_thread ../build//boost_regex ] diff --git a/test/unicode/unicode_casefold_test.cpp 
b/test/unicode/unicode_casefold_test.cpp new file mode 100644 index 00000000..d1597c81 --- /dev/null +++ b/test/unicode/unicode_casefold_test.cpp @@ -0,0 +1,204 @@ +/* + * + * Copyright (c) 2021 John Maddock + * Copyright (c) 2021 Daniel Kruegler + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE unicode_casefold_test.cpp + * VERSION see + * DESCRIPTION: Simple test suite for Unicode case folding. + */ + +#include +#include +#include "../test_macros.hpp" + +#if defined(BOOST_HAS_ICU) + +#include + +#include + +#include +#include + +typedef std::pair unicode_verinfo; + +// Function to query the effective Unicode major and minor +// version, because some spot test cases can only be tested +// for specific Unicode versions. +unicode_verinfo get_unicode_version() +{ + UVersionInfo versionArray = {}; + u_getUnicodeVersion(versionArray); + unicode_verinfo result(versionArray[0] , versionArray[1]); + return result; +} + +void latin_1_checks() +{ + typedef boost::icu_regex_traits traits_type; + traits_type traits; + + // Test range [U+0000, U+0041): Identity fold + for (traits_type::char_type c = 0x0; c < 0x41; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } + + // Test ASCII upper case letters [A, Z]: Each character folds + // to its lowercase variant: + for (traits_type::char_type c = 0x41; c <= 0x5A; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + const int shift = 0x61 - 0x41; + BOOST_CHECK_EQUAL(nc, c + shift); + BOOST_CHECK_EQUAL(nc, traits.tolower(c)); + } + + // Test range (U+005A, U+00B5): Identity fold + for (traits_type::char_type c = 0x5A + 1; c < 0xB5; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } + + // 
U+00B5 maps to its decomposition GREEK SMALL LETTER MU + // (U+03BC): + { + traits_type::char_type c = 0xB5; + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, 0x03BC); + } + + // Test range (U+00B5, U+00BF]: Identity fold + for (traits_type::char_type c = 0xB5 + 1; c <= 0xBF; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } + + // Test range [U+00C0, U+00D6]: Each character folds + // to its lowercase variant: + for (traits_type::char_type c = 0xC0; c <= 0xD6; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + traits_type::char_type lc = traits.tolower(c); + BOOST_CHECK_EQUAL(nc, lc); + BOOST_CHECK_NE(nc, c); + } + + // U+00D7: Identity fold + { + traits_type::char_type c = 0xD7; + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } + + // Test range [U+00D8, U+00DE]: Each character folds + // to its lowercase variant: + for (traits_type::char_type c = 0xD8; c <= 0xDE; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + traits_type::char_type lc = traits.tolower(c); + BOOST_CHECK_EQUAL(nc, lc); + BOOST_CHECK_NE(nc, c); + } + + // Test range [U+00DF, U+00FF]: Identity fold + // Note that case folding of U+00DF (LATIN SMALL + // LETTER SHARP S) does not fold to U+1E9E (LATIN + // CAPITAL LETTER SHARP S) due to case folding + // stability contract + for (traits_type::char_type c = 0xDF; c <= 0xFF; ++c) + { + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } +} + +void spot_checks() +{ + // test specific values ripped straight out of the Unicode standard + // to verify that our case folding is the same as theirs: + typedef boost::icu_regex_traits traits_type; + traits_type traits; + + const unicode_verinfo unicode_version = get_unicode_version(); + + // 'LATIN CAPITAL LETTER SHARP S' folds to + // 'LATIN SMALL LETTER SHARP S' + if (unicode_version >= unicode_verinfo(5, 1)) + { + 
traits_type::char_type c = 0x1E9E; + traits_type::char_type nc = traits.translate_nocase(c); + traits_type::char_type lc = traits.tolower(c); + BOOST_CHECK_EQUAL(nc, lc); + BOOST_CHECK_EQUAL(nc, 0xDF); + } + + // Capital sigma (U+03A3) is the uppercase form of both the regular (U+03C3) + // and final (U+03C2) lowercase sigma. All these characters exist since + // Unicode 1.1.0. + { + traits_type::char_type c = 0x03A3; + traits_type::char_type nc = traits.translate_nocase(c); + traits_type::char_type lc = traits.tolower(c); + BOOST_CHECK_EQUAL(nc, lc); + BOOST_CHECK_EQUAL(nc, 0x03C3); + c = 0x03C2; + nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, 0x03C3); + c = 0x03C3; + nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + } + + // In Turkic languages the lowercase letter 'i' (U+0069) maps to an + // uppercase dotted I (U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE), + // while the uppercase letter 'I' (U+0049) maps to the dotless lowercase + // i (U+0131). The Unicode simple default mapping folds U+0130 to itself, + // but folds U+0049 to U+0069. + { + traits_type::char_type c = 0x0130; + traits_type::char_type nc = traits.translate_nocase(c); + BOOST_CHECK_EQUAL(nc, c); + c = 0x0049; + nc = traits.translate_nocase(c); + traits_type::char_type lc = traits.tolower(c); + BOOST_CHECK_EQUAL(nc, lc); + BOOST_CHECK_EQUAL(nc, 0x0069); + } + + // Cherokee small letters were added with Unicode 8.0, + // but the upper case letters existed before, therefore + // the small letters case fold to upper case letters. + if (unicode_version >= unicode_verinfo(8, 0)) + { + traits_type::char_type c = 0x13F8; + traits_type::char_type nc = traits.translate_nocase(c); + traits_type::char_type uc = traits.toupper(c); + BOOST_CHECK_EQUAL(nc, uc); + BOOST_CHECK_EQUAL(nc, 0x13F0); + } + +} + +#endif + +int cpp_main( int, char* [] ) +{ +#if defined(BOOST_HAS_ICU) + latin_1_checks(); + spot_checks(); +#endif + return boost::report_errors(); +}