Merge "Implement wcwidth(3) in terms of icu4c."
am: bafa1985ec
Change-Id: Ia7e74b6124206fe5f2c5bf372c03ee5246045462
This commit is contained in:
commit
a97079d94e
6 changed files with 249 additions and 10 deletions
|
@ -1446,6 +1446,7 @@ cc_library_static {
|
|||
"bionic/getpriority.cpp",
|
||||
"bionic/gettid.cpp",
|
||||
"bionic/grp_pwd.cpp",
|
||||
"bionic/icu_wrappers.cpp",
|
||||
"bionic/ifaddrs.cpp",
|
||||
"bionic/inotify_init.cpp",
|
||||
"bionic/ioctl.cpp",
|
||||
|
@ -1553,6 +1554,7 @@ cc_library_static {
|
|||
"bionic/wchar_l.cpp",
|
||||
"bionic/wcstod.cpp",
|
||||
"bionic/wctype.cpp",
|
||||
"bionic/wcwidth.cpp",
|
||||
"bionic/wmempcpy.cpp",
|
||||
],
|
||||
|
||||
|
|
49
libc/bionic/icu_wrappers.cpp
Normal file
49
libc/bionic/icu_wrappers.cpp
Normal file
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Copyright (C) 2017 The Android Open Source Project
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "private/icu.h"
|
||||
|
||||
int8_t __icu_charType(wint_t wc) {
|
||||
typedef int8_t (*u_charType_t)(UChar32);
|
||||
static auto u_charType = reinterpret_cast<u_charType_t>(__find_icu_symbol("u_charType"));
|
||||
return u_charType ? u_charType(wc) : -1;
|
||||
}
|
||||
|
||||
int32_t __icu_getIntPropertyValue(wint_t wc, UProperty property) {
|
||||
typedef int32_t (*u_getIntPropertyValue_t)(UChar32, UProperty);
|
||||
static auto u_getIntPropertyValue =
|
||||
reinterpret_cast<u_getIntPropertyValue_t>(__find_icu_symbol("u_getIntPropertyValue"));
|
||||
return u_getIntPropertyValue ? u_getIntPropertyValue(wc, property) : 0;
|
||||
}
|
||||
|
||||
bool __icu_hasBinaryProperty(wint_t wc, UProperty property, int (*fallback)(int)) {
|
||||
typedef UBool (*u_hasBinaryProperty_t)(UChar32, UProperty);
|
||||
static auto u_hasBinaryProperty =
|
||||
reinterpret_cast<u_hasBinaryProperty_t>(__find_icu_symbol("u_hasBinaryProperty"));
|
||||
return u_hasBinaryProperty ? u_hasBinaryProperty(wc, property) : fallback(wc);
|
||||
}
|
|
@ -53,12 +53,6 @@ enum {
|
|||
WC_TYPE_MAX
|
||||
};
|
||||
|
||||
static bool __icu_hasBinaryProperty(wint_t wc, UProperty property, int (*fallback)(int)) {
|
||||
typedef UBool (*FnT)(UChar32, UProperty);
|
||||
static auto u_hasBinaryProperty = reinterpret_cast<FnT>(__find_icu_symbol("u_hasBinaryProperty"));
|
||||
return u_hasBinaryProperty ? u_hasBinaryProperty(wc, property) : fallback(wc);
|
||||
}
|
||||
|
||||
int iswalnum(wint_t wc) { return __icu_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM, isalnum); }
|
||||
int iswalpha(wint_t wc) { return __icu_hasBinaryProperty(wc, UCHAR_ALPHABETIC, isalpha); }
|
||||
int iswblank(wint_t wc) { return __icu_hasBinaryProperty(wc, UCHAR_POSIX_BLANK, isblank); }
|
||||
|
@ -155,10 +149,6 @@ wctype_t wctype_l(const char* property, locale_t) {
|
|||
return wctype(property);
|
||||
}
|
||||
|
||||
int wcwidth(wchar_t wc) {
|
||||
return (wc > 0);
|
||||
}
|
||||
|
||||
static wctrans_t wctrans_tolower = wctrans_t(1);
|
||||
static wctrans_t wctrans_toupper = wctrans_t(2);
|
||||
|
||||
|
|
92
libc/bionic/wcwidth.cpp
Normal file
92
libc/bionic/wcwidth.cpp
Normal file
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Copyright (C) 2017 The Android Open Source Project
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <wchar.h>
|
||||
|
||||
#include "private/icu.h"
|
||||
|
||||
int wcwidth(wchar_t wc) {
|
||||
// Fast-path ASCII.
|
||||
if (wc >= 0x20 && wc < 0x7f) return 1;
|
||||
|
||||
// ASCII NUL is a special case.
|
||||
if (wc == 0) return 0;
|
||||
|
||||
// C0.
|
||||
if (wc < ' ' || (wc >= 0x7f && wc <= 0xa0)) return -1;
|
||||
|
||||
// Now for the i18n part. This isn't defined or standardized, so a lot of the choices are
|
||||
// pretty arbitrary. See https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for more details.
|
||||
|
||||
// Fancy unicode control characters?
|
||||
switch (__icu_charType(wc)) {
|
||||
case -1:
|
||||
// No icu4c available; give up.
|
||||
return -1;
|
||||
case U_CONTROL_CHAR:
|
||||
return -1;
|
||||
case U_NON_SPACING_MARK:
|
||||
case U_ENCLOSING_MARK:
|
||||
case U_FORMAT_CHAR:
|
||||
return 0;
|
||||
}
|
||||
if (__icu_hasBinaryProperty(wc, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, nullptr)) return 0;
|
||||
|
||||
// Medial and final jamo render as zero width when used correctly.
|
||||
switch (__icu_getIntPropertyValue(wc, UCHAR_HANGUL_SYLLABLE_TYPE)) {
|
||||
case U_HST_VOWEL_JAMO:
|
||||
case U_HST_TRAILING_JAMO:
|
||||
return 0;
|
||||
case U_HST_LEADING_JAMO:
|
||||
case U_HST_LV_SYLLABLE:
|
||||
case U_HST_LVT_SYLLABLE:
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (wc >= 0x3248 && wc <= 0x4dff) {
|
||||
// Circled two-digit CJK "speed sign" numbers. EastAsianWidth is ambiguous,
|
||||
// but wide makes more sense.
|
||||
if (wc <= 0x324f) return 2;
|
||||
// Hexagrams. EastAsianWidth is neutral, but wide seems better.
|
||||
if (wc >= 0x4dc0) return 2;
|
||||
}
|
||||
|
||||
// The EastAsianWidth property is at least defined by the Unicode standard!
|
||||
switch (__icu_getIntPropertyValue(wc, UCHAR_EAST_ASIAN_WIDTH)) {
|
||||
case U_EA_AMBIGUOUS:
|
||||
case U_EA_HALFWIDTH:
|
||||
case U_EA_NARROW:
|
||||
case U_EA_NEUTRAL:
|
||||
return 1;
|
||||
case U_EA_FULLWIDTH:
|
||||
case U_EA_WIDE:
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -30,12 +30,14 @@
|
|||
#define _PRIVATE_ICU_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <wchar.h>
|
||||
|
||||
typedef int8_t UBool;
|
||||
typedef int32_t UChar32;
|
||||
|
||||
enum UProperty {
|
||||
UCHAR_ALPHABETIC = 0,
|
||||
UCHAR_DEFAULT_IGNORABLE_CODE_POINT = 5,
|
||||
UCHAR_LOWERCASE = 22,
|
||||
UCHAR_POSIX_ALNUM = 44,
|
||||
UCHAR_POSIX_BLANK = 45,
|
||||
|
@ -44,12 +46,39 @@ enum UProperty {
|
|||
UCHAR_POSIX_XDIGIT = 48,
|
||||
UCHAR_UPPERCASE = 30,
|
||||
UCHAR_WHITE_SPACE = 31,
|
||||
UCHAR_EAST_ASIAN_WIDTH = 0x1004,
|
||||
UCHAR_HANGUL_SYLLABLE_TYPE = 0x100b,
|
||||
};
|
||||
|
||||
// The subset of icu4c's UCharCategory values (as returned by u_charType()) that
// bionic needs. The numeric values must match icu4c's own definitions.
enum UCharCategory {
  U_NON_SPACING_MARK = 6,  // General category Mn.
  U_ENCLOSING_MARK = 7,    // General category Me.
  U_CONTROL_CHAR = 15,     // General category Cc.
  U_FORMAT_CHAR = 16,      // General category Cf.
};
|
||||
|
||||
// Values of the UCHAR_EAST_ASIAN_WIDTH property. The numeric values are spelled out
// because they must match icu4c's own definitions.
enum UEastAsianWidth {
  U_EA_NEUTRAL = 0,    // [N]
  U_EA_AMBIGUOUS = 1,  // [A]
  U_EA_HALFWIDTH = 2,  // [H]
  U_EA_FULLWIDTH = 3,  // [F]
  U_EA_NARROW = 4,     // [Na]
  U_EA_WIDE = 5,       // [W]
};
|
||||
|
||||
// Values of the UCHAR_HANGUL_SYLLABLE_TYPE property. The numeric values are spelled
// out because they must match icu4c's own definitions.
enum UHangulSyllableType {
  U_HST_NOT_APPLICABLE = 0,  // [NA]
  U_HST_LEADING_JAMO = 1,    // [L]
  U_HST_VOWEL_JAMO = 2,      // [V]
  U_HST_TRAILING_JAMO = 3,   // [T]
  U_HST_LV_SYLLABLE = 4,     // [LV]
  U_HST_LVT_SYLLABLE = 5,    // [LVT]
};
|
||||
|
||||
int8_t __icu_charType(wint_t wc);
|
||||
int32_t __icu_getIntPropertyValue(wint_t wc, UProperty property);
|
||||
bool __icu_hasBinaryProperty(wint_t wc, UProperty property, int (*fallback)(int));
|
||||
|
||||
void* __find_icu_symbol(const char* symbol_name);
|
||||
|
||||
#endif // _PRIVATE_ICU_H
|
||||
|
|
|
@ -754,3 +754,80 @@ TEST(wchar, wcstof) {
|
|||
TEST(wchar, wcstold) {
|
||||
CheckWcsToFloat(wcstold);
|
||||
}
|
||||
|
||||
static void AssertWcwidthRange(wchar_t begin, wchar_t end, int expected) {
|
||||
for (wchar_t i = begin; i < end; ++i) {
|
||||
EXPECT_EQ(expected, wcwidth(i)) << static_cast<int>(i);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(wchar, wcwidth_NUL) {
  // Although NUL is a C0 control, its width is defined to be 0, not -1.
  const wchar_t nul = 0;
  EXPECT_EQ(0, wcwidth(nul));
}
|
||||
|
||||
TEST(wchar, wcwidth_ascii) {
  // Every ASCII character that is neither a C0 control nor DEL: [0x20, 0x7f).
  const wchar_t first_printable = 0x20;
  const wchar_t del = 0x7f;
  AssertWcwidthRange(first_printable, del, 1);
}
|
||||
|
||||
TEST(wchar, wcwidth_controls) {
  // C0 controls other than NUL: [0x01, 0x20).
  AssertWcwidthRange(0x01, 0x20, -1);
  // DEL.
  EXPECT_EQ(-1, wcwidth(0x7f));
  // C1 controls: [0x80, 0xa0).
  AssertWcwidthRange(0x80, 0xa0, -1);
}
|
||||
|
||||
TEST(wchar, wcwidth_non_spacing_and_enclosing_marks_and_format) {
  // Non-spacing mark: combining grave.
  EXPECT_EQ(0, wcwidth(0x0300));
  // Enclosing mark: combining enclosing circle.
  EXPECT_EQ(0, wcwidth(0x20dd));
  // Soft hyphen (SHY).
  EXPECT_EQ(0, wcwidth(0x00ad));
  // Zero width space.
  EXPECT_EQ(0, wcwidth(0x200b));
}
|
||||
|
||||
TEST(wchar, wcwidth_cjk) {
  // CJK unified block: first and last code points.
  EXPECT_EQ(2, wcwidth(0x4e00));
  EXPECT_EQ(2, wcwidth(0x9fff));
  // CJK extension A block: first and last code points.
  EXPECT_EQ(2, wcwidth(0x3400));
  EXPECT_EQ(2, wcwidth(0x4dbf));
  // CJK extension B block: first and last code points.
  EXPECT_EQ(2, wcwidth(0x20000));
  EXPECT_EQ(2, wcwidth(0x2a6df));
}
|
||||
|
||||
TEST(wchar, wcwidth_korean_combining_jamo) {
  // The original combining jamo range: [0x1160, 0x1200).
  AssertWcwidthRange(0x1160, 0x1200, 0);
  // Two code points from the newer range.
  EXPECT_EQ(0, wcwidth(0xd7b0));
  EXPECT_EQ(0, wcwidth(0xd7cb));
}
|
||||
|
||||
TEST(wchar, wcwidth_korean_jeongeul_syllables) {
  // First code point of the syllable block.
  EXPECT_EQ(2, wcwidth(0xac00));
  // Last defined code point in the block as of Unicode 7.
  EXPECT_EQ(2, wcwidth(0xd7a3));
  // Note: undefined characters at the end of the block have width 1 (not checked here).
}
|
||||
|
||||
TEST(wchar, wcwidth_kana) {
  // Hiragana: [0x3041, 0x3097), which covers most of the block and skips undefined code points.
  AssertWcwidthRange(0x3041, 0x3097, 2);
  // Katakana: [0x30a0, 0x3100).
  AssertWcwidthRange(0x30a0, 0x3100, 2);
}
|
||||
|
||||
TEST(wchar, wcwidth_circled_two_digit_cjk) {
  // The circled two-digit CJK "speed sign" numbers, [0x3248, 0x3250), are treated
  // as wide even though their EastAsianWidth is ambiguous.
  AssertWcwidthRange(0x3248, 0x3250, 2);
}
|
||||
|
||||
TEST(wchar, wcwidth_hexagrams) {
  // The hexagrams, [0x4dc0, 0x4e00), are treated as wide even though their
  // EastAsianWidth is neutral.
  AssertWcwidthRange(0x4dc0, 0x4e00, 2);
}
|
||||
|
||||
TEST(wchar, wcwidth_default_ignorables) {
  // [0xfff0, 0xfff8): unassigned, but default ignorable.
  AssertWcwidthRange(0xfff0, 0xfff8, 0);
  // Start of the default-ignorable range that runs through 0xe0fff.
  EXPECT_EQ(0, wcwidth(0xe0000));
}
|
||||
|
||||
TEST(wchar, wcwidth_korean_common_non_syllables) {
  // Standalone jamo commonly used as emoticons are expected to be wide.
  const wchar_t crying = L'ㅜ';    // Korean "crying" emoticon.
  const wchar_t laughing = L'ㅋ';  // Korean "laughing" emoticon.
  EXPECT_EQ(2, wcwidth(crying));
  EXPECT_EQ(2, wcwidth(laughing));
}
|
||||
|
|
Loading…
Reference in a new issue