Merge changes I1a60d6ef,Idabf0107 into main am: 1982dce568
am: d5ef5029fe
am: 0378513f6d
am: cdd4a25109
am: a287663d1c
Original change: https://android-review.googlesource.com/c/platform/bionic/+/2670156 Change-Id: I9e3dfb1928d414dd66db5fe1b2309a83f5147a71 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
This commit is contained in:
commit
1adec823d5
5 changed files with 78 additions and 126 deletions
|
@ -47,15 +47,36 @@ static size_t begin_surrogate(char32_t c32, char16_t* pc16,
|
|||
mbstate_set_byte(state, 3, nconv & 0xff);
|
||||
|
||||
*pc16 = ((c32 & 0xffc00) >> 10) | 0xd800;
|
||||
// Defined by POSIX as return value for first surrogate character.
|
||||
return static_cast<size_t>(-3);
|
||||
// https://issuetracker.google.com/289419882
|
||||
//
|
||||
// We misread the spec when implementing this. The first call should return
|
||||
// the length of the decoded character, and the second call should return -3
|
||||
// to indicate that the output is a continuation of the character decoded by
|
||||
// the first call.
|
||||
//
|
||||
// C23 7.30.1.3.4:
|
||||
//
|
||||
// between 1 and n inclusive if the next n or fewer bytes complete a valid
|
||||
// multibyte character (which is the value stored); the value returned is
|
||||
// the number of bytes that complete the multibyte character.
|
||||
//
|
||||
// (size_t)(-3) if the next character resulting from a previous call has
|
||||
// been stored (no bytes from the input have been consumed by this call).
|
||||
//
|
||||
// The first call returns the number of bytes consumed, and the second call
|
||||
// returns -3.
|
||||
//
|
||||
// All UTF-8 sequences that encode a surrogate pair are 4 bytes, but we may
|
||||
// not have seen the full sequence yet.
|
||||
return nconv;
|
||||
}
|
||||
|
||||
static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) {
|
||||
char16_t trail = mbstate_get_byte(state, 1) << 8 |
|
||||
mbstate_get_byte(state, 0);
|
||||
*pc16 = trail;
|
||||
return mbstate_reset_and_return(mbstate_get_byte(state, 3), state);
|
||||
mbstate_reset(state);
|
||||
return static_cast<size_t>(-3);
|
||||
}
|
||||
|
||||
size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) {
|
||||
|
|
|
@ -51,7 +51,21 @@ size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) {
|
|||
}
|
||||
|
||||
if (n == 0) {
|
||||
return 0;
|
||||
// C23 7.30.1 (for each `mbrtoc*` function) says:
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// 0 if the next n or fewer bytes complete the multibyte character that
|
||||
// corresponds to the null wide character (which is the value stored).
|
||||
//
|
||||
// (size_t)(-2) if the next n bytes contribute to an incomplete (but
|
||||
// potentially valid) multibyte character, and all n bytes have been
|
||||
// processed (no value is stored).
|
||||
//
|
||||
// Bionic historically interpreted the behavior when n is 0 to be the next 0
|
||||
// bytes decoding to the null. That's a pretty bad interpretation, and both
|
||||
// glibc and musl return -2 for that case.
|
||||
return BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE;
|
||||
}
|
||||
|
||||
uint8_t ch;
|
||||
|
|
|
@ -57,14 +57,18 @@ static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
|
|||
return ps->__seq[n];
|
||||
}
|
||||
|
||||
static inline void mbstate_reset(mbstate_t* ps) {
|
||||
*(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
|
||||
}
|
||||
|
||||
static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
|
||||
errno = _errno;
|
||||
*(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
|
||||
mbstate_reset(ps);
|
||||
return BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE;
|
||||
}
|
||||
|
||||
static inline __wur size_t mbstate_reset_and_return(size_t _return, mbstate_t* ps) {
|
||||
*(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
|
||||
mbstate_reset(ps);
|
||||
return _return;
|
||||
}
|
||||
|
||||
|
|
|
@ -40,31 +40,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false;
|
|||
#error kLibcRejectsOverLongUtf8Sequences must be configured for this platform
|
||||
#endif
|
||||
|
||||
// C23 7.30.1 (for each `mbrtoc*` function) says:
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// 0 if the next n or fewer bytes complete the multibyte character that
|
||||
// corresponds to the null wide character (which is the value stored).
|
||||
//
|
||||
// (size_t)(-2) if the next n bytes contribute to an incomplete (but
|
||||
// potentially valid) multibyte character, and all n bytes have been
|
||||
// processed (no value is stored).
|
||||
//
|
||||
// Bionic historically interpreted the behavior when n is 0 to be the next 0
|
||||
// bytes decoding to the null. That's a pretty bad interpretation, and both
|
||||
// glibc and musl return -2 for that case.
|
||||
//
|
||||
// The tests currently checks the incorrect behavior for bionic because gtest
|
||||
// doesn't support explicit xfail annotations. The behavior difference here
|
||||
// should be fixed, but danalbert wants to add more tests before tackling the
|
||||
// bugs.
|
||||
#ifdef __ANDROID__
|
||||
constexpr size_t kExpectedResultForZeroLength = 0U;
|
||||
#else
|
||||
constexpr size_t kExpectedResultForZeroLength = static_cast<size_t>(-2);
|
||||
#endif
|
||||
|
||||
TEST(uchar, sizeof_uchar_t) {
|
||||
EXPECT_EQ(2U, sizeof(char16_t));
|
||||
EXPECT_EQ(4U, sizeof(char32_t));
|
||||
|
@ -199,11 +174,11 @@ TEST(uchar, mbrtoc16_zero_len) {
|
|||
char16_t out;
|
||||
|
||||
out = L'x';
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(L'x', out);
|
||||
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "", 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrtoc16(&out, "hello", 1, nullptr));
|
||||
EXPECT_EQ(L'h', out);
|
||||
}
|
||||
|
@ -224,32 +199,6 @@ TEST(uchar, mbrtoc16) {
|
|||
ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, nullptr));
|
||||
ASSERT_EQ(static_cast<char16_t>(0x20ac), out);
|
||||
// 4-byte UTF-8 will be returned as a surrogate pair...
|
||||
#ifdef __BIONIC__
|
||||
// https://issuetracker.google.com/289419882
|
||||
//
|
||||
// We misread the spec when implementing this. The first call should return
|
||||
// the length of the decoded character, and the second call should return -3
|
||||
// to indicate that the output is a continuation of the character decoded by
|
||||
// the first call.
|
||||
//
|
||||
// C23 7.30.1.3.4:
|
||||
//
|
||||
// between 1 and n inclusive if the next n or fewer bytes complete a valid
|
||||
// multibyte character (which is the value stored); the value returned is
|
||||
// the number of bytes that complete the multibyte character.
|
||||
//
|
||||
// (size_t)(-3) if the next character resulting from a previous call has
|
||||
// been stored (no bytes from the input have been consumed by this call).
|
||||
//
|
||||
// Leaving the test for the wrong outputs here while we clean up and improve
|
||||
// the rest of the tests to get a better handle on the behavior differences
|
||||
// before fixing the bug.
|
||||
ASSERT_EQ(static_cast<size_t>(-3),
|
||||
mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
|
||||
ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, nullptr));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
|
||||
#else
|
||||
ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
|
||||
ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out,
|
||||
|
@ -257,7 +206,6 @@ TEST(uchar, mbrtoc16) {
|
|||
"ef",
|
||||
6, nullptr));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(uchar, mbrtoc16_long_sequences) {
|
||||
|
@ -326,14 +274,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) {
|
|||
// 4-byte UTF-8.
|
||||
ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xf4", 1, ps));
|
||||
ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps));
|
||||
#ifdef __BIONIC__
|
||||
// https://issuetracker.google.com/289419882
|
||||
// See explanation in mbrtoc16 test for the same bug.
|
||||
ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
|
||||
ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
|
||||
#else
|
||||
ASSERT_EQ(1U, mbrtoc16(&out,
|
||||
"\x8d"
|
||||
"ef",
|
||||
|
@ -344,7 +284,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) {
|
|||
"ef",
|
||||
3, ps));
|
||||
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
|
||||
#endif
|
||||
ASSERT_TRUE(mbsinit(ps));
|
||||
|
||||
// Invalid 2-byte
|
||||
|
@ -444,16 +383,16 @@ TEST(uchar, mbrtoc32) {
|
|||
char32_t out[8];
|
||||
|
||||
out[0] = L'x';
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<char32_t>(L'x'), out[0]);
|
||||
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "", 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrtoc32(out, "hello", 1, nullptr));
|
||||
EXPECT_EQ(static_cast<char32_t>(L'h'), out[0]);
|
||||
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "hello", 0, nullptr));
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(nullptr, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(nullptr, "", 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrtoc32(nullptr, "hello", 1, nullptr));
|
||||
|
||||
EXPECT_EQ(0U, mbrtoc32(nullptr, nullptr, 0, nullptr));
|
||||
|
|
|
@ -54,31 +54,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false;
|
|||
#error kLibcRejectsOverLongUtf8Sequences must be configured for this platform
|
||||
#endif
|
||||
|
||||
// C23 7.31.6.3.2 (mbrtowc) says:
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// 0 if the next n or fewer bytes complete the multibyte character that
|
||||
// corresponds to the null wide character (which is the value stored).
|
||||
//
|
||||
// (size_t)(-2) if the next n bytes contribute to an incomplete (but
|
||||
// potentially valid) multibyte character, and all n bytes have been
|
||||
// processed (no value is stored).
|
||||
//
|
||||
// Bionic historically interpreted the behavior when n is 0 to be the next 0
|
||||
// bytes decoding to the null. That's a pretty bad interpretation, and both
|
||||
// glibc and musl return -2 for that case.
|
||||
//
|
||||
// The tests currently checks the incorrect behavior for bionic because gtest
|
||||
// doesn't support explicit xfail annotations. The behavior difference here
|
||||
// should be fixed, but danalbert wants to add more tests before tackling the
|
||||
// bugs.
|
||||
#ifdef __ANDROID__
|
||||
constexpr size_t kExpectedResultForZeroLength = 0U;
|
||||
#else
|
||||
constexpr size_t kExpectedResultForZeroLength = static_cast<size_t>(-2);
|
||||
#endif
|
||||
|
||||
#if defined(__GLIBC__)
|
||||
constexpr bool kLibcSupportsParsingBinaryLiterals = __GLIBC_PREREQ(2, 38);
|
||||
#else
|
||||
|
@ -92,7 +67,7 @@ TEST(wchar, sizeof_wchar_t) {
|
|||
|
||||
TEST(wchar, mbrlen) {
|
||||
char bytes[] = { 'h', 'e', 'l', 'l', 'o', '\0' };
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrlen(&bytes[0], 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrlen(&bytes[0], 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrlen(&bytes[0], 1, nullptr));
|
||||
|
||||
EXPECT_EQ(1U, mbrlen(&bytes[4], 1, nullptr));
|
||||
|
@ -317,37 +292,36 @@ TEST(wchar, wcsstr_80199) {
|
|||
TEST(wchar, mbtowc) {
|
||||
wchar_t out[8];
|
||||
|
||||
// bionic has the same misunderstanding of the result for a zero-length
|
||||
// conversion for mbtowc as it does for all the other multibyte conversion
|
||||
// functions but mbtowc returns different values than all the others:
|
||||
// mbtowc and all the mbrto* APIs behave slightly differently when n is 0:
|
||||
//
|
||||
// C23 7.24.7.2.4:
|
||||
// mbrtowc returns 0 "if the next n or fewer bytes complete the multibyte
|
||||
// character that corresponds to the null wide character"
|
||||
//
|
||||
// If s is a null pointer, the mbtowc function returns a nonzero or zero
|
||||
// value, if multibyte character encodings, respectively, do or do not have
|
||||
// state-dependent encodings. If s is not a null pointer, the mbtowc function
|
||||
// either returns 0 (if s points to the null character), or returns the number
|
||||
// of bytes that are contained in the converted multibyte character (if the
|
||||
// next n or fewer bytes form a valid multibyte character), or returns -1 (if
|
||||
// they do not form a valid multibyte character).
|
||||
|
||||
#ifdef __BIONIC__
|
||||
int expected_result_for_zero_length = 0;
|
||||
// mbrtoc says: "If s is not a null pointer, the mbtowc function either
|
||||
// returns 0 (if s points to the null character)..."
|
||||
//
|
||||
// So mbrtowc will not provide the correct mbtowc return value for "" and
|
||||
// n = 0.
|
||||
//
|
||||
// glibc gets this right, but all the BSDs (including macOS) and bionic (by
|
||||
// way of openbsd) return -1 instead of 0.
|
||||
#ifdef __GLIBC__
|
||||
int expected_result_for_zero_length_empty_string = 0;
|
||||
#else
|
||||
int expected_result_for_zero_length = -1;
|
||||
int expected_result_for_zero_length_empty_string = -1;
|
||||
#endif
|
||||
|
||||
out[0] = 'x';
|
||||
EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0));
|
||||
EXPECT_EQ(-1, mbtowc(out, "hello", 0));
|
||||
EXPECT_EQ('x', out[0]);
|
||||
|
||||
EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0));
|
||||
EXPECT_EQ(0, mbtowc(out, "", 0));
|
||||
EXPECT_EQ(-1, mbtowc(out, "hello", 0));
|
||||
EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(out, "", 0));
|
||||
EXPECT_EQ(1, mbtowc(out, "hello", 1));
|
||||
EXPECT_EQ(L'h', out[0]);
|
||||
|
||||
EXPECT_EQ(expected_result_for_zero_length, mbtowc(nullptr, "hello", 0));
|
||||
EXPECT_EQ(0, mbtowc(nullptr, "", 0));
|
||||
EXPECT_EQ(-1, mbtowc(nullptr, "hello", 0));
|
||||
EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(nullptr, "", 0));
|
||||
EXPECT_EQ(1, mbtowc(nullptr, "hello", 1));
|
||||
|
||||
EXPECT_EQ(0, mbtowc(nullptr, nullptr, 0));
|
||||
|
@ -357,16 +331,16 @@ TEST(wchar, mbrtowc) {
|
|||
wchar_t out[8];
|
||||
|
||||
out[0] = 'x';
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ('x', out[0]);
|
||||
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "", 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrtowc(out, "hello", 1, nullptr));
|
||||
EXPECT_EQ(L'h', out[0]);
|
||||
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "hello", 0, nullptr));
|
||||
EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(nullptr, "hello", 0, nullptr));
|
||||
EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(nullptr, "", 0, nullptr));
|
||||
EXPECT_EQ(1U, mbrtowc(nullptr, "hello", 1, nullptr));
|
||||
|
||||
EXPECT_EQ(0U, mbrtowc(nullptr, nullptr, 0, nullptr));
|
||||
|
|
Loading…
Reference in a new issue