platform_bionic/tests/iconv_test.cpp
Elliott Hughes 20c023fdb2 iconv(3): ignore src_bytes_left if src_bytes is null.
This is undefined behavior, but glibc and macOS are both lenient, and
someone hit this in the wild, so we may as well be lenient too. (The
only cost is that it's now slightly easier to write code that works on
everything except old versions of Android.)

Bug: https://issuetracker.google.com/180598400
Test: treehugger
Change-Id: Ia217169ea6283cc53f4fbf71e5abfa08356c2049
2021-02-18 10:37:22 -08:00

463 lines
13 KiB
C++

/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <iconv.h>
// The conversion descriptor value that iconv_open() hands back on failure.
// The tests below compare against it, and also pass it to iconv() and
// iconv_close() as a deliberately invalid descriptor.
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
TEST(iconv, iconv_open_EINVAL) {
  // Every pair below names at least one encoding that doesn't exist, whether
  // as the destination, the source, or both. iconv_open() must reject each of
  // them with EINVAL.
  static const struct {
    const char* dst_enc;
    const char* src_enc;
  } kBadPairs[] = {
      {"silly", "silly"},
      {"silly", "UTF-8"},
      {"UTF-8", "silly"},
  };

  for (const auto& pair : kBadPairs) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open(pair.dst_enc, pair.src_enc))
        << pair.dst_enc << " <- " << pair.src_enc;
    ASSERT_EQ(EINVAL, errno) << pair.dst_enc << " <- " << pair.src_enc;
  }
}
TEST(iconv, iconv_open_comparator) {
  // Checks that encoding names are compared loosely, per the examples from
  // http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
  // "For example, the following names should match: "UTF-8", "utf8", "u.t.f-008", ..."
  static const char* const kMatching[] = {"utf8", "u.t.f-008"};
  for (const char* alias : kMatching) {
    iconv_t c = iconv_open("UTF-8", alias);
    ASSERT_NE(INVALID_ICONV_T, c) << alias;
    ASSERT_EQ(0, iconv_close(c)) << alias;
  }

  // "...but not "utf-80" or "ut8"."
  // "ut8" is the example the text above actually cites. "ut80" is an extra
  // near-miss that this test has always covered, so it is kept as well.
  static const char* const kNonMatching[] = {"utf-80", "ut8", "ut80"};
  for (const char* alias : kNonMatching) {
    errno = 0;
    ASSERT_EQ(INVALID_ICONV_T, iconv_open("UTF-8", alias)) << alias;
    ASSERT_EQ(EINVAL, errno) << alias;
  }
}
TEST(iconv, iconv_smoke) {
  // Converts a short UTF-8 string made of a 1-byte, a 2-byte and a 3-byte
  // sequence to UTF-32LE, then checks each output character and the byte
  // counts that iconv() reports back.
  const char* utf8 = "a٦ᄀ"; // U+0666 ٦ 0xd9 0xa6 // U+1100 ᄀ 0xe1 0x84 0x80

  // Aligned for wchar_t because the output is read back through a wchar_t*.
  alignas(wchar_t) char buf[BUFSIZ] = {};

  iconv_t c = iconv_open("UTF-32LE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = const_cast<char*>(utf8);
  size_t in_bytes = strlen(in);
  char* out = buf;
  size_t out_bytes = sizeof(buf);
  EXPECT_EQ(0U, iconv(c, &in, &in_bytes, &out, &out_bytes));

  // NOTE(review): reading the UTF-32LE output as wchar_t assumes a 4-byte,
  // little-endian wchar_t on the target; confirm before porting elsewhere.
  // The expected values are spelled as universal character names so that they
  // survive tools that mangle non-ASCII source text.
  wchar_t* utf32 = reinterpret_cast<wchar_t*>(buf);
  EXPECT_EQ(L'a', utf32[0]);
  EXPECT_EQ(L'\u0666', utf32[1]);  // ٦
  EXPECT_EQ(L'\u1100', utf32[2]);  // ᄀ
  EXPECT_EQ(L'\0', utf32[3]);      // buf was zero-filled; nothing more was written.

  EXPECT_EQ(0U, in_bytes);
  EXPECT_EQ(sizeof(buf) - (3 /* chars */ * 4 /* bytes each */), out_bytes);

  ASSERT_EQ(0, iconv_close(c));
}
TEST(iconv, iconv_lossy_TRANSLIT) {
  // Input: 'a', U+0666 ٦ (0xd9 0xa6), U+1100 ᄀ (0xe1 0x84 0x80), 'z'.
  const char* utf8 = "a٦ᄀz";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//TRANSLIT", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The two middle characters (5 input bytes) can't be represented in ASCII.
  // Under "//TRANSLIT" each becomes a replacement character, and the return
  // value is the number of replacements made.
  EXPECT_EQ(2U, iconv(cd, &src, &src_left, &dst, &dst_left));

  // buf started out zero-filled, so matching the whole string also proves
  // that the byte after 'z' was left untouched.
  EXPECT_STREQ("a??z", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 4, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_lossy_IGNORE) {
  // Input: 'a', U+0666 ٦ (0xd9 0xa6), U+1100 ᄀ (0xe1 0x84 0x80), 'z'.
  const char* utf8 = "a٦ᄀz";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII//IGNORE", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The two middle characters (5 input bytes) can't be represented in ASCII.
  // Under "//IGNORE" they are dropped from the output, but the call still
  // reports failure with EILSEQ.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // Only 'a' and 'z' reach the (zero-filled) output buffer, and all of the
  // input is consumed.
  EXPECT_STREQ("az", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_lossy) {
  // Input: 'a', U+0666 ٦ (0xd9 0xa6), U+1100 ᄀ (0xe1 0x84 0x80), 'z'.
  const char* utf8 = "a٦ᄀz";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("ASCII", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // With no "//TRANSLIT" or "//IGNORE" suffix, conversion stops at the second
  // character because it has no ASCII representation.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);

  // Only 'a' was written to the (zero-filled) output buffer.
  EXPECT_STREQ("a", buf);
  // Still unconsumed: two bytes for ٦, three bytes for ᄀ, and one byte for z.
  EXPECT_EQ(6U, src_left);
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_malformed_sequence_EILSEQ) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦, but here it is followed
  // by 'z' rather than the rest of that sequence.
  const char* utf8 = "a\xd9z";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // First pass: conversion stops with EILSEQ at the malformed second byte,
  // leaving the input pointer at the start of the invalid sequence.
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EILSEQ, errno);
  EXPECT_EQ('\xd9', *src);

  // Step over the bad byte by hand, then resume with the same descriptor.
  ++src;
  --src_left;

  // Second pass: the remaining input converts cleanly.
  errno = 0;
  EXPECT_EQ(0U, iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(0, errno);

  // The output holds just the two valid characters from both passes.
  EXPECT_STREQ("az", buf);
  EXPECT_EQ(0U, src_left);
  EXPECT_EQ(sizeof(buf) - 2, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_incomplete_sequence_EINVAL) {
  // 0xd9 is the first byte of the two-byte U+0666 ٦, and the input ends
  // immediately after it.
  const char* utf8 = "a\xd9";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = sizeof(buf);

  // The second byte only starts a character and no more input follows, so the
  // call reports EINVAL (an incomplete sequence, not a malformed one).
  errno = 0;
  EXPECT_EQ(static_cast<size_t>(-1), iconv(cd, &src, &src_left, &dst, &dst_left));
  EXPECT_EQ(EINVAL, errno);

  // The input pointer is left at the start of the incomplete sequence, with
  // that one byte still counted as unconsumed.
  EXPECT_EQ('\xd9', *src);
  EXPECT_EQ(1U, src_left);

  // Only 'a' was written to the (zero-filled) output buffer.
  EXPECT_STREQ("a", buf);
  EXPECT_EQ(sizeof(buf) - 1, dst_left);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_E2BIG) {
  // Feeds "abc" through a converter while offering too little output space,
  // checking that E2BIG is reported and that progress is made whenever at
  // least one byte of space is available.
  const char* utf8 = "abc";
  char buf[BUFSIZ] = {};

  iconv_t cd = iconv_open("UTF-8", "UTF-8");
  ASSERT_NE(INVALID_ICONV_T, cd);

  char* src = const_cast<char*>(utf8);
  size_t src_left = strlen(src);
  char* dst = buf;
  size_t dst_left = 0;

  const size_t kFailure = static_cast<size_t>(-1);
  const struct {
    size_t space_offered;
    size_t expected_result;
    int expected_errno;
    size_t expected_src_left;
  } kSteps[] = {
      // We need three bytes, so one isn't enough (but we will make progress).
      {1, kFailure, E2BIG, 2},
      // Two bytes left, so zero isn't enough (and we can't even make progress).
      {0, kFailure, E2BIG, 2},
      // Two bytes left, so one isn't enough (but we will make progress).
      {1, kFailure, E2BIG, 1},
      // One byte left, so one byte is now enough.
      {1, 0, 0, 0},
  };

  for (const auto& step : kSteps) {
    dst_left = step.space_offered;
    errno = 0;
    EXPECT_EQ(step.expected_result, iconv(cd, &src, &src_left, &dst, &dst_left));
    EXPECT_EQ(step.expected_errno, errno);
    EXPECT_EQ(step.expected_src_left, src_left);
    // Whatever space was offered is always used up completely.
    EXPECT_EQ(0U, dst_left);
  }

  // All three characters arrived, and nothing was written past them.
  EXPECT_STREQ("abc", buf);

  ASSERT_EQ(0, iconv_close(cd));
}
TEST(iconv, iconv_invalid_converter_EBADF) {
  // Calling iconv() on the invalid descriptor must fail with EBADF. The
  // buffers are empty because they are never expected to be touched.
  size_t in_bytes = 0;
  size_t out_bytes = 0;
  char* in = nullptr;
  char* out = nullptr;

  errno = 0;
  const size_t result = iconv(INVALID_ICONV_T, &in, &in_bytes, &out, &out_bytes);
  ASSERT_EQ(static_cast<size_t>(-1), result);
  ASSERT_EQ(EBADF, errno);
}
TEST(iconv, iconv_close_invalid_converter_EBADF) {
  // Closing the invalid descriptor must fail with EBADF.
  errno = 0;
  const int rc = iconv_close(INVALID_ICONV_T);
  ASSERT_EQ(-1, rc);
  ASSERT_EQ(EBADF, errno);
}
static void RoundTrip(const char* dst_enc, const char* expected_bytes, size_t n) {
// Examples from https://en.wikipedia.org/wiki/UTF-16.
const char* utf8 = "$€𐐷"; // U+0024, U+20AC, U+10437.
iconv_t c = iconv_open(dst_enc, "UTF-8");
ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
char* in = const_cast<char*>(utf8);
size_t in_bytes = strlen(utf8);
char buf[BUFSIZ] = {};
char* out = buf;
size_t out_bytes = sizeof(buf);
size_t replacement_count = iconv(c, &in, &in_bytes, &out, &out_bytes);
// Check we got the bytes we were expecting.
for (size_t i = 0; i < n; ++i) {
EXPECT_EQ(expected_bytes[i], buf[i]) << i << ' '<< dst_enc;
}
ASSERT_EQ(0, iconv_close(c));
// We can't round-trip if there were replacements.
if (strstr(dst_enc, "ascii")) {
GTEST_LOG_(INFO) << "can't round-trip " << dst_enc << "\n";
return;
}
ASSERT_EQ(0U, replacement_count);
c = iconv_open("UTF-8", dst_enc);
ASSERT_NE(INVALID_ICONV_T, c) << dst_enc;
in = buf;
in_bytes = n;
char buf2[BUFSIZ] = {};
out = buf2;
out_bytes = sizeof(buf2);
iconv(c, &in, &in_bytes, &out, &out_bytes);
ASSERT_STREQ(utf8, buf2) << dst_enc;
ASSERT_EQ(0, iconv_close(c));
}
TEST(iconv, iconv_round_trip_ascii) {
  // Only '$' is ASCII; with "//TRANSLIT" the other two characters are each
  // expected to become '?'.
  static const char kExpected[] = "$??";
  RoundTrip("ascii//TRANSLIT", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_utf8) {
  // One group per character: 1-byte, 3-byte and 4-byte UTF-8 sequences.
  static const char kExpected[] = "\x24" "\xe2\x82\xac" "\xf0\x90\x90\xb7";
  RoundTrip("utf8", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_utf16be) {
  // Two single code units followed by a surrogate pair, most significant byte
  // first. The array has an embedded NUL, so its length comes from sizeof.
  static const char kExpected[] = "\x00\x24" "\x20\xac" "\xd8\x01\xdc\x37";
  RoundTrip("utf16be", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_utf16le) {
  // Two single code units followed by a surrogate pair, least significant
  // byte first. The array has an embedded NUL, so its length comes from sizeof.
  static const char kExpected[] = "\x24\x00" "\xac\x20" "\x01\xd8\x37\xdc";
  RoundTrip("utf16le", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_utf32be) {
  // Three 4-byte code points, most significant byte first. The array has
  // embedded NULs, so its length comes from sizeof.
  static const char kExpected[] = "\x00\x00\x00\x24" "\x00\x00\x20\xac" "\x00\x01\x04\x37";
  RoundTrip("utf32be", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_utf32le) {
  // Three 4-byte code points, least significant byte first. The array has
  // embedded NULs, so its length comes from sizeof.
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("utf32le", kExpected, sizeof(kExpected) - 1);
}
TEST(iconv, iconv_round_trip_wchar_t) {
  // The expected bytes are the same as in the utf32le test.
  // NOTE(review): presumably this relies on wchar_t being 4 bytes and
  // little-endian on the target; confirm.
  static const char kExpected[] = "\x24\x00\x00\x00" "\xac\x20\x00\x00" "\x37\x04\x01\x00";
  RoundTrip("wchar_t", kExpected, sizeof(kExpected) - 1);
}
static void Check(int expected_errno, const char* src_enc, const char* src, size_t n) {
iconv_t c = iconv_open("wchar_t", src_enc);
char* in = const_cast<char*>(src);
size_t in_bytes = n;
wchar_t out_buf[16];
size_t out_bytes = sizeof(out_buf);
char* out = reinterpret_cast<char*>(out_buf);
errno = 0;
ASSERT_EQ(static_cast<size_t>(-1), iconv(c, &in, &in_bytes, &out, &out_bytes));
EXPECT_EQ(expected_errno, errno);
EXPECT_EQ(0, iconv_close(c));
}
TEST(iconv, iconv_EILSEQ_ascii) {
  // 0xac is greater than 0x7f, so it is not ASCII.
  static const char kInput[] = "\xac";
  Check(EILSEQ, "ASCII", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EILSEQ_utf8_initial) {
  // 0x82 is not a valid first byte of a UTF-8 sequence.
  static const char kInput[] = "\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EILSEQ_utf8_non_initial) {
  // The first byte is fine, but the second byte is not a valid continuation.
  static const char kInput[] = "\xe2\xe2\x82";
  Check(EILSEQ, "utf8", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EILSEQ_utf16be_low_surrogate_first) {
  // A surrogate pair with its two halves swapped: low surrogate, then high.
  static const char kInput[] = "\xdc\x37" "\xd8\x01";
  Check(EILSEQ, "utf16be", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EILSEQ_utf16le_low_surrogate_first) {
  // A surrogate pair with its two halves swapped: low surrogate, then high.
  static const char kInput[] = "\x37\xdc" "\x01\xd8";
  Check(EILSEQ, "utf16le", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf8_short) {
  // The first two bytes of a 3-byte sequence; the final byte is missing.
  static const char kInput[] = "\xe2\x82";
  Check(EINVAL, "utf8", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16be_short) {
  // Only the first byte of a two-byte code unit; the second is missing.
  // The byte is a NUL, so the length is taken from sizeof rather than strlen.
  static const char kInput[] = "\x00";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16be_missing_low_surrogate) {
  // A high surrogate with nothing after it.
  static const char kInput[] = "\xd8\x01";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16be_half_low_surrogate) {
  // A high surrogate followed by only the first byte of the low surrogate.
  static const char kInput[] = "\xd8\x01\xdc";
  Check(EINVAL, "utf16be", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16le_short) {
  // Only the first byte of a two-byte code unit; the second is missing.
  static const char kInput[] = "\x24";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16le_missing_low_surrogate) {
  // A high surrogate with nothing after it.
  static const char kInput[] = "\x01\xd8";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf16le_half_low_surrogate) {
  // A high surrogate followed by only the first byte of the low surrogate.
  static const char kInput[] = "\x01\xd8\x37";
  Check(EINVAL, "utf16le", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf32be_short) {
  // Three of the four bytes of a code unit; the final byte is missing.
  // The bytes are all NULs, so the length is taken from sizeof.
  static const char kInput[] = "\x00\x00\x00";
  Check(EINVAL, "utf32be", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_EINVAL_utf32le_short) {
  // Three of the four bytes of a code unit; the final byte is missing.
  // The input has embedded NULs, so the length is taken from sizeof.
  static const char kInput[] = "\x24\x00\x00";
  Check(EINVAL, "utf32le", kInput, sizeof(kInput) - 1);
}
TEST(iconv, iconv_initial_shift_state) {
  // POSIX: "For state-dependent encodings, the conversion descriptor
  // cd is placed into its initial shift state by a call for which inbuf
  // is a null pointer, or for which inbuf points to a null pointer."
  //
  // Each of the three call shapes below must succeed, leave errno alone, and
  // write nothing to the output buffer.
  iconv_t c = iconv_open("utf8", "utf8");
  // Fail here with a clear message rather than with EBADF from iconv() below.
  ASSERT_NE(INVALID_ICONV_T, c);

  char* in = nullptr;
  size_t in_bytes = 0;
  wchar_t out_buf[16];
  size_t out_bytes = sizeof(out_buf);
  char* out = reinterpret_cast<char*>(out_buf);

  // Points to a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, &in, &in_bytes, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer...
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, &in_bytes, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  // Is a null pointer and so is in_bytes. This isn't specified by POSIX, but
  // glibc and macOS both allow that, where Android historically didn't.
  // https://issuetracker.google.com/180598400
  errno = 0;
  ASSERT_EQ(static_cast<size_t>(0), iconv(c, nullptr, nullptr, &out, &out_bytes));
  EXPECT_EQ(0, errno);
  EXPECT_EQ(sizeof(out_buf), out_bytes);

  EXPECT_EQ(0, iconv_close(c));
}