From 3a8fed1ac339b279000ba89f85041d9e28829288 Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Thu, 20 Jul 2023 23:08:22 +0000 Subject: [PATCH 1/2] Fix mismatched return types for surrogate pairs. We've had these backward all this time. The relevant quote is in a code comment in the implementation, but the first call after completely decoding a code point that requires a surrogate pair should return the number of bytes decoded by the most recent call, and the second call should return -3 (if only C had given those some named constants that might have been more obviously wrong). Bug: https://issuetracker.google.com/289419882 Test: Fixed the test, tests run against glibc and musl to confirm Change-Id: Idabf01075b1cad35b604ede8d676d6f0b1dc91e6 --- libc/bionic/mbrtoc16.cpp | 27 +++++++++++++++++++++++--- libc/private/bionic_mbstate.h | 8 ++++++-- tests/uchar_test.cpp | 36 ----------------------------------- 3 files changed, 30 insertions(+), 41 deletions(-) diff --git a/libc/bionic/mbrtoc16.cpp b/libc/bionic/mbrtoc16.cpp index 154b8a35b..e87991a4e 100644 --- a/libc/bionic/mbrtoc16.cpp +++ b/libc/bionic/mbrtoc16.cpp @@ -47,15 +47,36 @@ static size_t begin_surrogate(char32_t c32, char16_t* pc16, mbstate_set_byte(state, 3, nconv & 0xff); *pc16 = ((c32 & 0xffc00) >> 10) | 0xd800; - // Defined by POSIX as return value for first surrogate character. - return static_cast(-3); + // https://issuetracker.google.com/289419882 + // + // We misread the spec when implementing this. The first call should return + // the length of the decoded character, and the second call should return -3 + // to indicate that the output is a continuation of the character decoded by + // the first call. + // + // C23 7.30.1.3.4: + // + // between 1 and n inclusive if the next n or fewer bytes complete a valid + // multibyte character (which is the value stored); the value returned is + // the number of bytes that complete the multibyte character. + // + // (size_t)(-3) if the next character resulting from a previous call has + // been stored (no bytes from the input have been consumed by this call). + // + // The first call returns the number of bytes consumed, and the second call + // returns -3. + // + // All UTF-8 sequences that encode a surrogate pair are 4 bytes, but we may + // not have seen the full sequence yet. + return nconv; } static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) { char16_t trail = mbstate_get_byte(state, 1) << 8 | mbstate_get_byte(state, 0); *pc16 = trail; - return mbstate_reset_and_return(mbstate_get_byte(state, 3), state); + mbstate_reset(state); + return static_cast(-3); } size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) { diff --git a/libc/private/bionic_mbstate.h b/libc/private/bionic_mbstate.h index 29f5aa679..0e5f861f7 100644 --- a/libc/private/bionic_mbstate.h +++ b/libc/private/bionic_mbstate.h @@ -57,14 +57,18 @@ static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) { return ps->__seq[n]; } +static inline void mbstate_reset(mbstate_t* ps) { + *(reinterpret_cast(ps->__seq)) = 0; +} + static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) { errno = _errno; - *(reinterpret_cast(ps->__seq)) = 0; + mbstate_reset(ps); return BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE; } static inline __wur size_t mbstate_reset_and_return(size_t _return, mbstate_t* ps) { - *(reinterpret_cast(ps->__seq)) = 0; + mbstate_reset(ps); return _return; } diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp index 9e42f58dd..c08b5742f 100644 --- a/tests/uchar_test.cpp +++ b/tests/uchar_test.cpp @@ -224,32 +224,6 @@ TEST(uchar, mbrtoc16) { ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, nullptr)); ASSERT_EQ(static_cast(0x20ac), out); // 4-byte UTF-8 will be returned as a surrogate pair... -#ifdef __BIONIC__ - // https://issuetracker.google.com/289419882 - // - // We misread the spec when implementing this. The first call should return - // the length of the decoded character, and the second call should return -3 - // to indicate that the output is a continuation of the character decoded by - // the first call. - // - // C23 7.30.1.3.4: - // - // between 1 and n inclusive if the next n or fewer bytes complete a valid - // multibyte character (which is the value stored); the value returned is - // the number of bytes that complete the multibyte character. - // - // (size_t)(-3) if the next character resulting from a previous call has - // been stored (no bytes from the input have been consumed by this call). - // - // Leaving the test for the wrong outputs here while we clean up and improve - // the rest of the tests to get a better handle on the behavior differences - // before fixing the bug. - ASSERT_EQ(static_cast(-3), - mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr)); - ASSERT_EQ(static_cast(0xdbea), out); - ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, nullptr)); - ASSERT_EQ(static_cast(0xdfcd), out); -#else ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr)); ASSERT_EQ(static_cast(0xdbea), out); ASSERT_EQ(static_cast(-3), mbrtoc16(&out, @@ -257,7 +231,6 @@ TEST(uchar, mbrtoc16) { "ef", 6, nullptr)); ASSERT_EQ(static_cast(0xdfcd), out); -#endif } TEST(uchar, mbrtoc16_long_sequences) { @@ -326,14 +299,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) { // 4-byte UTF-8. ASSERT_EQ(static_cast(-2), mbrtoc16(&out, "\xf4", 1, ps)); ASSERT_EQ(static_cast(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps)); -#ifdef __BIONIC__ - // https://issuetracker.google.com/289419882 - // See explanation in mbrtoc16 test for the same bug. - ASSERT_EQ(static_cast(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps)); - ASSERT_EQ(static_cast(0xdbea), out); - ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps)); - ASSERT_EQ(static_cast(0xdfcd), out); -#else ASSERT_EQ(1U, mbrtoc16(&out, "\x8d" "ef", @@ -344,7 +309,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) { "ef", 3, ps)); ASSERT_EQ(static_cast(0xdfcd), out); -#endif ASSERT_TRUE(mbsinit(ps)); // Invalid 2-byte From 16007d520423649c00e21d0a422cd54a51134cb3 Mon Sep 17 00:00:00 2001 From: Dan Albert Date: Thu, 20 Jul 2023 23:38:57 +0000 Subject: [PATCH 2/2] Fix result for zero-length non-null conversion. Bug: None Test: deleted the xfail half of the test Change-Id: I1a60d6ef27ebad14de79ac3bc637a6f2280334d8 --- libc/bionic/mbrtoc32.cpp | 16 ++++++++- tests/uchar_test.cpp | 41 +++++----------------- tests/wchar_test.cpp | 76 +++++++++++++--------------------------- 3 files changed, 48 insertions(+), 85 deletions(-) diff --git a/libc/bionic/mbrtoc32.cpp b/libc/bionic/mbrtoc32.cpp index c26dd71d5..74deb4093 100644 --- a/libc/bionic/mbrtoc32.cpp +++ b/libc/bionic/mbrtoc32.cpp @@ -51,7 +51,21 @@ size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) { } if (n == 0) { - return 0; + // C23 7.30.1 (for each `mbrtoc*` function) says: + // + // Returns: + // + // 0 if the next n or fewer bytes complete the multibyte character that + // corresponds to the null wide character (which is the value stored). + // + // (size_t)(-2) if the next n bytes contribute to an incomplete (but + // potentially valid) multibyte character, and all n bytes have been + // processed (no value is stored). + // + // Bionic historically interpreted the behavior when n is 0 to be the next 0 + // bytes decoding to the null. That's a pretty bad interpretation, and both + // glibc and musl return -2 for that case. + return BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE; } uint8_t ch; diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp index c08b5742f..512f098eb 100644 --- a/tests/uchar_test.cpp +++ b/tests/uchar_test.cpp @@ -40,31 +40,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false; #error kLibcRejectsOverLongUtf8Sequences must be configured for this platform #endif -// C23 7.30.1 (for each `mbrtoc*` function) says: -// -// Returns: -// -// 0 if the next n or fewer bytes complete the multibyte character that -// corresponds to the null wide character (which is the value stored). -// -// (size_t)(-2) if the next n bytes contribute to an incomplete (but -// potentially valid) multibyte character, and all n bytes have been -// processed (no value is stored). -// -// Bionic historically interpreted the behavior when n is 0 to be the next 0 -// bytes decoding to the null. That's a pretty bad interpretation, and both -// glibc and musl return -2 for that case. -// -// The tests currently checks the incorrect behavior for bionic because gtest -// doesn't support explicit xfail annotations. The behavior difference here -// should be fixed, but danalbert wants to add more tests before tackling the -// bugs. -#ifdef __ANDROID__ -constexpr size_t kExpectedResultForZeroLength = 0U; -#else -constexpr size_t kExpectedResultForZeroLength = static_cast(-2); -#endif - TEST(uchar, sizeof_uchar_t) { EXPECT_EQ(2U, sizeof(char16_t)); EXPECT_EQ(4U, sizeof(char32_t)); @@ -199,11 +174,11 @@ TEST(uchar, mbrtoc16_zero_len) { char16_t out; out = L'x'; - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc16(&out, "hello", 0, nullptr)); EXPECT_EQ(L'x', out); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr)); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc16(&out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc16(&out, "", 0, nullptr)); EXPECT_EQ(1U, mbrtoc16(&out, "hello", 1, nullptr)); EXPECT_EQ(L'h', out); } @@ -408,16 +383,16 @@ TEST(uchar, mbrtoc32) { char32_t out[8]; out[0] = L'x'; - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc32(out, "hello", 0, nullptr)); EXPECT_EQ(static_cast(L'x'), out[0]); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr)); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc32(out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc32(out, "", 0, nullptr)); EXPECT_EQ(1U, mbrtoc32(out, "hello", 1, nullptr)); EXPECT_EQ(static_cast(L'h'), out[0]); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "hello", 0, nullptr)); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc32(nullptr, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtoc32(nullptr, "", 0, nullptr)); EXPECT_EQ(1U, mbrtoc32(nullptr, "hello", 1, nullptr)); EXPECT_EQ(0U, mbrtoc32(nullptr, nullptr, 0, nullptr)); diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp index 04932ba21..28c1046ef 100644 --- a/tests/wchar_test.cpp +++ b/tests/wchar_test.cpp @@ -54,31 +54,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false; #error kLibcRejectsOverLongUtf8Sequences must be configured for this platform #endif -// C23 7.31.6.3.2 (mbrtowc) says: -// -// Returns: -// -// 0 if the next n or fewer bytes complete the multibyte character that -// corresponds to the null wide character (which is the value stored). -// -// (size_t)(-2) if the next n bytes contribute to an incomplete (but -// potentially valid) multibyte character, and all n bytes have been -// processed (no value is stored). -// -// Bionic historically interpreted the behavior when n is 0 to be the next 0 -// bytes decoding to the null. That's a pretty bad interpretation, and both -// glibc and musl return -2 for that case. -// -// The tests currently checks the incorrect behavior for bionic because gtest -// doesn't support explicit xfail annotations. The behavior difference here -// should be fixed, but danalbert wants to add more tests before tackling the -// bugs. -#ifdef __ANDROID__ -constexpr size_t kExpectedResultForZeroLength = 0U; -#else -constexpr size_t kExpectedResultForZeroLength = static_cast(-2); -#endif - #if defined(__GLIBC__) constexpr bool kLibcSupportsParsingBinaryLiterals = __GLIBC_PREREQ(2, 38); #else @@ -92,7 +67,7 @@ TEST(wchar, sizeof_wchar_t) { TEST(wchar, mbrlen) { char bytes[] = { 'h', 'e', 'l', 'l', 'o', '\0' }; - EXPECT_EQ(kExpectedResultForZeroLength, mbrlen(&bytes[0], 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrlen(&bytes[0], 0, nullptr)); EXPECT_EQ(1U, mbrlen(&bytes[0], 1, nullptr)); EXPECT_EQ(1U, mbrlen(&bytes[4], 1, nullptr)); @@ -317,37 +292,36 @@ TEST(wchar, wcsstr_80199) { TEST(wchar, mbtowc) { wchar_t out[8]; - // bionic has the same misunderstanding of the result for a zero-length - // conversion for mbtowc as it does for all the other multibyte conversion - // functions but mbtowc returns different values than all the others: + // mbtowc and all the mbrto* APIs behave slightly differently when n is 0: // - // C23 7.24.7.2.4: + // mbrtowc returns 0 "if the next n or fewer bytes complete the multibyte + // character that corresponds to the null wide character" // - // If s is a null pointer, the mbtowc function returns a nonzero or zero - // value, if multibyte character encodings, respectively, do or do not have - // state-dependent encodings. If s is not a null pointer, the mbtowc function - // either returns 0 (if s points to the null character), or returns the number - // of bytes that are contained in the converted multibyte character (if the - // next n or fewer bytes form a valid multibyte character), or returns -1 (if - // they do not form a valid multibyte character). - -#ifdef __BIONIC__ - int expected_result_for_zero_length = 0; + // mbrtoc says: "If s is not a null pointer, the mbtowc function either + // returns 0 (if s points to the null character)..." + // + // So mbrtowc will not provide the correct mbtowc return value for "" and + // n = 0. + // + // glibc gets this right, but all the BSDs (including macOS) and bionic (by + // way of openbsd) return -1 instead of 0. +#ifdef __GLIBC__ + int expected_result_for_zero_length_empty_string = 0; #else - int expected_result_for_zero_length = -1; + int expected_result_for_zero_length_empty_string = -1; #endif out[0] = 'x'; - EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0)); + EXPECT_EQ(-1, mbtowc(out, "hello", 0)); EXPECT_EQ('x', out[0]); - EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0)); - EXPECT_EQ(0, mbtowc(out, "", 0)); + EXPECT_EQ(-1, mbtowc(out, "hello", 0)); + EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(out, "", 0)); EXPECT_EQ(1, mbtowc(out, "hello", 1)); EXPECT_EQ(L'h', out[0]); - EXPECT_EQ(expected_result_for_zero_length, mbtowc(nullptr, "hello", 0)); - EXPECT_EQ(0, mbtowc(nullptr, "", 0)); + EXPECT_EQ(-1, mbtowc(nullptr, "hello", 0)); + EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(nullptr, "", 0)); EXPECT_EQ(1, mbtowc(nullptr, "hello", 1)); EXPECT_EQ(0, mbtowc(nullptr, nullptr, 0)); @@ -357,16 +331,16 @@ TEST(wchar, mbrtowc) { wchar_t out[8]; out[0] = 'x'; - EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtowc(out, "hello", 0, nullptr)); EXPECT_EQ('x', out[0]); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr)); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtowc(out, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtowc(out, "", 0, nullptr)); EXPECT_EQ(1U, mbrtowc(out, "hello", 1, nullptr)); EXPECT_EQ(L'h', out[0]); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "hello", 0, nullptr)); - EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtowc(nullptr, "hello", 0, nullptr)); + EXPECT_EQ(static_cast(-2), mbrtowc(nullptr, "", 0, nullptr)); EXPECT_EQ(1U, mbrtowc(nullptr, "hello", 1, nullptr)); EXPECT_EQ(0U, mbrtowc(nullptr, nullptr, 0, nullptr));