From 3a8fed1ac339b279000ba89f85041d9e28829288 Mon Sep 17 00:00:00 2001
From: Dan Albert <danalbert@google.com>
Date: Thu, 20 Jul 2023 23:08:22 +0000
Subject: [PATCH 1/2] Fix mismatched return types for surrogate pairs.

We've had these backward all this time. The relevant quote is in a
code comment in the implementation, but the first call after
completely decoding a code point that requires a surrogate pair should
return the number of bytes decoded by the most recent call, and the
second call should return -3 (if only C had given those some named
constants that might have been more obviously wrong).

Bug: https://issuetracker.google.com/289419882
Test: Fixed the test, tests run against glibc and musl to confirm
Change-Id: Idabf01075b1cad35b604ede8d676d6f0b1dc91e6
---
 libc/bionic/mbrtoc16.cpp      | 27 +++++++++++++++++++++++---
 libc/private/bionic_mbstate.h |  8 ++++++--
 tests/uchar_test.cpp          | 36 -----------------------------------
 3 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/libc/bionic/mbrtoc16.cpp b/libc/bionic/mbrtoc16.cpp
index 154b8a35b..e87991a4e 100644
--- a/libc/bionic/mbrtoc16.cpp
+++ b/libc/bionic/mbrtoc16.cpp
@@ -47,15 +47,36 @@ static size_t begin_surrogate(char32_t c32, char16_t* pc16,
   mbstate_set_byte(state, 3, nconv & 0xff);
 
   *pc16 = ((c32 & 0xffc00) >> 10) | 0xd800;
-  // Defined by POSIX as return value for first surrogate character.
-  return static_cast<size_t>(-3);
+  // https://issuetracker.google.com/289419882
+  //
+  // We misread the spec when implementing this. The first call should return
+  // the length of the decoded character, and the second call should return -3
+  // to indicate that the output is a continuation of the character decoded by
+  // the first call.
+  //
+  // C23 7.30.1.3.4:
+  //
+  //     between 1 and n inclusive if the next n or fewer bytes complete a valid
+  //     multibyte character (which is the value stored); the value returned is
+  //     the number of bytes that complete the multibyte character.
+  //
+  //     (size_t)(-3) if the next character resulting from a previous call has
+  //     been stored (no bytes from the input have been consumed by this call).
+  //
+  // The first call returns the number of bytes consumed, and the second call
+  // returns -3.
+  //
+  // All UTF-8 sequences that encode a surrogate pair are 4 bytes, but we may
+  // not have seen the full sequence yet.
+  return nconv;
 }
 
 static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) {
   char16_t trail = mbstate_get_byte(state, 1) << 8 |
                    mbstate_get_byte(state, 0);
   *pc16 = trail;
-  return mbstate_reset_and_return(mbstate_get_byte(state, 3), state);
+  mbstate_reset(state);
+  return static_cast<size_t>(-3);
 }
 
 size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) {
diff --git a/libc/private/bionic_mbstate.h b/libc/private/bionic_mbstate.h
index 29f5aa679..0e5f861f7 100644
--- a/libc/private/bionic_mbstate.h
+++ b/libc/private/bionic_mbstate.h
@@ -57,14 +57,18 @@ static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
   return ps->__seq[n];
 }
 
+static inline void mbstate_reset(mbstate_t* ps) {
+  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+}
+
 static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
   errno = _errno;
-  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  mbstate_reset(ps);
   return BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE;
 }
 
 static inline __wur size_t mbstate_reset_and_return(size_t _return, mbstate_t* ps) {
-  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  mbstate_reset(ps);
   return _return;
 }
 
diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp
index 9e42f58dd..c08b5742f 100644
--- a/tests/uchar_test.cpp
+++ b/tests/uchar_test.cpp
@@ -224,32 +224,6 @@ TEST(uchar, mbrtoc16) {
   ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0x20ac), out);
   // 4-byte UTF-8 will be returned as a surrogate pair...
-#ifdef __BIONIC__
-  // https://issuetracker.google.com/289419882
-  //
-  // We misread the spec when implementing this. The first call should return
-  // the length of the decoded character, and the second call should return -3
-  // to indicate that the output is a continuation of the character decoded by
-  // the first call.
-  //
-  // C23 7.30.1.3.4:
-  //
-  //     between 1 and n inclusive if the next n or fewer bytes complete a valid
-  //     multibyte character (which is the value stored); the value returned is
-  //     the number of bytes that complete the multibyte character.
-  //
-  //     (size_t)(-3) if the next character resulting from a previous call has
-  //     been stored (no bytes from the input have been consumed by this call).
-  //
-  // Leaving the test for the wrong outputs here while we clean up and improve
-  // the rest of the tests to get a better handle on the behavior differences
-  // before fixing the bug.
-  ASSERT_EQ(static_cast<size_t>(-3),
-            mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
-  ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
-  ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, nullptr));
-  ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
   ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
   ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out,
@@ -257,7 +231,6 @@ TEST(uchar, mbrtoc16) {
                                               "ef",
                                               6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
 }
 
 TEST(uchar, mbrtoc16_long_sequences) {
@@ -326,14 +299,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) {
   // 4-byte UTF-8.
   ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xf4", 1, ps));
   ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps));
-#ifdef __BIONIC__
-  // https://issuetracker.google.com/289419882
-  // See explanation in mbrtoc16 test for the same bug.
-  ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps));
-  ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
-  ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps));
-  ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
   ASSERT_EQ(1U, mbrtoc16(&out,
                          "\x8d"
                          "ef",
@@ -344,7 +309,6 @@ void test_mbrtoc16_incomplete(mbstate_t* ps) {
                                               "ef",
                                               3, ps));
   ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
   ASSERT_TRUE(mbsinit(ps));
 
   // Invalid 2-byte

From 16007d520423649c00e21d0a422cd54a51134cb3 Mon Sep 17 00:00:00 2001
From: Dan Albert <danalbert@google.com>
Date: Thu, 20 Jul 2023 23:38:57 +0000
Subject: [PATCH 2/2] Fix result for zero-length non-null conversion.

Bug: None
Test: deleted the xfail half of the test
Change-Id: I1a60d6ef27ebad14de79ac3bc637a6f2280334d8
---
 libc/bionic/mbrtoc32.cpp | 16 ++++++++-
 tests/uchar_test.cpp     | 41 +++++-----------------
 tests/wchar_test.cpp     | 76 +++++++++++++---------------------------
 3 files changed, 48 insertions(+), 85 deletions(-)

diff --git a/libc/bionic/mbrtoc32.cpp b/libc/bionic/mbrtoc32.cpp
index c26dd71d5..74deb4093 100644
--- a/libc/bionic/mbrtoc32.cpp
+++ b/libc/bionic/mbrtoc32.cpp
@@ -51,7 +51,21 @@ size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) {
   }
 
   if (n == 0) {
-    return 0;
+    // C23 7.30.1 (for each `mbrtoc*` function) says:
+    //
+    // Returns:
+    //
+    //     0 if the next n or fewer bytes complete the multibyte character that
+    //     corresponds to the null wide character (which is the value stored).
+    //
+    //     (size_t)(-2) if the next n bytes contribute to an incomplete (but
+    //     potentially valid) multibyte character, and all n bytes have been
+    //     processed (no value is stored).
+    //
+    // Bionic historically interpreted the behavior when n is 0 to be the next 0
+    // bytes decoding to the null. That's a pretty bad interpretation, and both
+    // glibc and musl return -2 for that case.
+    return BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE;
   }
 
   uint8_t ch;
diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp
index c08b5742f..512f098eb 100644
--- a/tests/uchar_test.cpp
+++ b/tests/uchar_test.cpp
@@ -40,31 +40,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false;
 #error kLibcRejectsOverLongUtf8Sequences must be configured for this platform
 #endif
 
-// C23 7.30.1 (for each `mbrtoc*` function) says:
-//
-// Returns:
-//
-//     0 if the next n or fewer bytes complete the multibyte character that
-//     corresponds to the null wide character (which is the value stored).
-//
-//     (size_t)(-2) if the next n bytes contribute to an incomplete (but
-//     potentially valid) multibyte character, and all n bytes have been
-//     processed (no value is stored).
-//
-// Bionic historically interpreted the behavior when n is 0 to be the next 0
-// bytes decoding to the null. That's a pretty bad interpretation, and both
-// glibc and musl return -2 for that case.
-//
-// The tests currently checks the incorrect behavior for bionic because gtest
-// doesn't support explicit xfail annotations. The behavior difference here
-// should be fixed, but danalbert wants to add more tests before tackling the
-// bugs.
-#ifdef __ANDROID__
-constexpr size_t kExpectedResultForZeroLength = 0U;
-#else
-constexpr size_t kExpectedResultForZeroLength = static_cast<size_t>(-2);
-#endif
-
 TEST(uchar, sizeof_uchar_t) {
   EXPECT_EQ(2U, sizeof(char16_t));
   EXPECT_EQ(4U, sizeof(char32_t));
@@ -199,11 +174,11 @@ TEST(uchar, mbrtoc16_zero_len) {
   char16_t out;
 
   out = L'x';
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "hello", 0, nullptr));
   EXPECT_EQ(L'x', out);
 
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "hello", 0, nullptr));
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc16(&out, "", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "", 0, nullptr));
   EXPECT_EQ(1U, mbrtoc16(&out, "hello", 1, nullptr));
   EXPECT_EQ(L'h', out);
 }
@@ -408,16 +383,16 @@ TEST(uchar, mbrtoc32) {
   char32_t out[8];
 
   out[0] = L'x';
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "hello", 0, nullptr));
   EXPECT_EQ(static_cast<char32_t>(L'x'), out[0]);
 
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "hello", 0, nullptr));
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(out, "", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(out, "", 0, nullptr));
   EXPECT_EQ(1U, mbrtoc32(out, "hello", 1, nullptr));
   EXPECT_EQ(static_cast<char32_t>(L'h'), out[0]);
 
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "hello", 0, nullptr));
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtoc32(nullptr, "", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(nullptr, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(nullptr, "", 0, nullptr));
   EXPECT_EQ(1U, mbrtoc32(nullptr, "hello", 1, nullptr));
 
   EXPECT_EQ(0U, mbrtoc32(nullptr, nullptr, 0, nullptr));
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index 04932ba21..28c1046ef 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp
@@ -54,31 +54,6 @@ constexpr bool kLibcRejectsOverLongUtf8Sequences = false;
 #error kLibcRejectsOverLongUtf8Sequences must be configured for this platform
 #endif
 
-// C23 7.31.6.3.2 (mbrtowc) says:
-//
-// Returns:
-//
-//     0 if the next n or fewer bytes complete the multibyte character that
-//     corresponds to the null wide character (which is the value stored).
-//
-//     (size_t)(-2) if the next n bytes contribute to an incomplete (but
-//     potentially valid) multibyte character, and all n bytes have been
-//     processed (no value is stored).
-//
-// Bionic historically interpreted the behavior when n is 0 to be the next 0
-// bytes decoding to the null. That's a pretty bad interpretation, and both
-// glibc and musl return -2 for that case.
-//
-// The tests currently checks the incorrect behavior for bionic because gtest
-// doesn't support explicit xfail annotations. The behavior difference here
-// should be fixed, but danalbert wants to add more tests before tackling the
-// bugs.
-#ifdef __ANDROID__
-constexpr size_t kExpectedResultForZeroLength = 0U;
-#else
-constexpr size_t kExpectedResultForZeroLength = static_cast<size_t>(-2);
-#endif
-
 #if defined(__GLIBC__)
 constexpr bool kLibcSupportsParsingBinaryLiterals = __GLIBC_PREREQ(2, 38);
 #else
@@ -92,7 +67,7 @@ TEST(wchar, sizeof_wchar_t) {
 
 TEST(wchar, mbrlen) {
   char bytes[] = { 'h', 'e', 'l', 'l', 'o', '\0' };
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrlen(&bytes[0], 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrlen(&bytes[0], 0, nullptr));
   EXPECT_EQ(1U, mbrlen(&bytes[0], 1, nullptr));
 
   EXPECT_EQ(1U, mbrlen(&bytes[4], 1, nullptr));
@@ -317,37 +292,36 @@ TEST(wchar, wcsstr_80199) {
 TEST(wchar, mbtowc) {
   wchar_t out[8];
 
-  // bionic has the same misunderstanding of the result for a zero-length
-  // conversion for mbtowc as it does for all the other multibyte conversion
-  // functions but mbtowc returns different values than all the others:
+  // mbtowc and all the mbrto* APIs behave slightly differently when n is 0:
   //
-  // C23 7.24.7.2.4:
+  // mbrtowc returns 0 "if the next n or fewer bytes complete the multibyte
+  // character that corresponds to the null wide character"
   //
-  // If s is a null pointer, the mbtowc function returns a nonzero or zero
-  // value, if multibyte character encodings, respectively, do or do not have
-  // state-dependent encodings. If s is not a null pointer, the mbtowc function
-  // either returns 0 (if s points to the null character), or returns the number
-  // of bytes that are contained in the converted multibyte character (if the
-  // next n or fewer bytes form a valid multibyte character), or returns -1 (if
-  // they do not form a valid multibyte character).
-
-#ifdef __BIONIC__
-  int expected_result_for_zero_length = 0;
+  // mbrtoc says: "If s is not a null pointer, the mbtowc function either
+  // returns 0 (if s points to the null character)..."
+  //
+  // So mbrtowc will not provide the correct mbtowc return value for "" and
+  // n = 0.
+  //
+  // glibc gets this right, but all the BSDs (including macOS) and bionic (by
+  // way of openbsd) return -1 instead of 0.
+#ifdef __GLIBC__
+  int expected_result_for_zero_length_empty_string = 0;
 #else
-  int expected_result_for_zero_length = -1;
+  int expected_result_for_zero_length_empty_string = -1;
 #endif
 
   out[0] = 'x';
-  EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0));
+  EXPECT_EQ(-1, mbtowc(out, "hello", 0));
   EXPECT_EQ('x', out[0]);
 
-  EXPECT_EQ(expected_result_for_zero_length, mbtowc(out, "hello", 0));
-  EXPECT_EQ(0, mbtowc(out, "", 0));
+  EXPECT_EQ(-1, mbtowc(out, "hello", 0));
+  EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(out, "", 0));
   EXPECT_EQ(1, mbtowc(out, "hello", 1));
   EXPECT_EQ(L'h', out[0]);
 
-  EXPECT_EQ(expected_result_for_zero_length, mbtowc(nullptr, "hello", 0));
-  EXPECT_EQ(0, mbtowc(nullptr, "", 0));
+  EXPECT_EQ(-1, mbtowc(nullptr, "hello", 0));
+  EXPECT_EQ(expected_result_for_zero_length_empty_string, mbtowc(nullptr, "", 0));
   EXPECT_EQ(1, mbtowc(nullptr, "hello", 1));
 
   EXPECT_EQ(0, mbtowc(nullptr, nullptr, 0));
@@ -357,16 +331,16 @@ TEST(wchar, mbrtowc) {
   wchar_t out[8];
 
   out[0] = 'x';
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "hello", 0, nullptr));
   EXPECT_EQ('x', out[0]);
 
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "hello", 0, nullptr));
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(out, "", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(out, "", 0, nullptr));
   EXPECT_EQ(1U, mbrtowc(out, "hello", 1, nullptr));
   EXPECT_EQ(L'h', out[0]);
 
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "hello", 0, nullptr));
-  EXPECT_EQ(kExpectedResultForZeroLength, mbrtowc(nullptr, "", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(nullptr, "hello", 0, nullptr));
+  EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(nullptr, "", 0, nullptr));
   EXPECT_EQ(1U, mbrtowc(nullptr, "hello", 1, nullptr));
 
   EXPECT_EQ(0U, mbrtowc(nullptr, nullptr, 0, nullptr));