/* * Copyright (C) 2017 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include "private/bionic_mbstate.h" #define INVALID_ICONV_T reinterpret_cast(-1) // Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something // equivalent to (but slightly easier to use for runs of text than) . If you're // here to add more encodings, consider working on finishing the icu4c NDK wrappers instead. enum Encoding { US_ASCII, UTF_8, UTF_16_LE, UTF_16_BE, UTF_32_LE, UTF_32_BE, WCHAR_T, }; enum Mode { ERROR, IGNORE, TRANSLIT, }; // This matching is strange but true. // See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching. static bool __match_encoding(const char* lhs, const char* rhs) { while (*lhs && *rhs) { // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent. // Also implement the "delete each 0 that is not preceded by a digit" rule. for (; *lhs; ++lhs) { if (isalnum(*lhs) && (*lhs != '0' || !isdigit(*(lhs + 1)))) break; } // Case doesn't matter either. if (tolower(*lhs) != tolower(*rhs)) break; ++lhs; ++rhs; } // As a special case we treat the GNU "//" extensions as end of string. if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true; return false; } static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { const char* suffix = strstr(s, "//"); if (suffix) { if (!mode) return false; if (strcmp(suffix, "//IGNORE") == 0) { *mode = IGNORE; } else if (strcmp(suffix, "//TRANSLIT") == 0) { *mode = TRANSLIT; } else { return false; } } if (__match_encoding(s, "utf8")) { *encoding = UTF_8; } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { *encoding = US_ASCII; } else if (__match_encoding(s, "utf16le")) { *encoding = UTF_16_LE; } else if (__match_encoding(s, "utf16be")) { *encoding = UTF_16_BE; } else if (__match_encoding(s, "utf32le")) { *encoding = UTF_32_LE; } else if (__match_encoding(s, "utf32be")) { *encoding = UTF_32_BE; } else if (__match_encoding(s, "wchart")) { *encoding = WCHAR_T; } else { return false; } return true; } struct __iconv_t { Encoding src_encoding; Encoding dst_encoding; Mode mode; __iconv_t() : mode(ERROR) { } int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { // Reset state. wc = 0; memset(&ps, 0, sizeof(ps)); replacement_count = 0; ignored = false; src_buf = src_buf0; src_bytes_left = src_bytes_left0; dst_buf = dst_buf0; dst_bytes_left = dst_bytes_left0; while (*src_bytes_left > 0) { if (!GetNext() || !Convert()) return -1; } return Done(); } private: char32_t wc; char buf[16]; size_t src_bytes_used; size_t dst_bytes_used; mbstate_t ps; size_t replacement_count; bool ignored; char** src_buf; size_t* src_bytes_left; char** dst_buf; size_t* dst_bytes_left; bool GetNext() { errno = 0; switch (src_encoding) { case US_ASCII: wc = **src_buf; src_bytes_used = 1; if (wc > 0x7f) errno = EILSEQ; break; case UTF_8: src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { break; // EILSEQ already set. } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { errno = EINVAL; return false; } break; case UTF_16_BE: case UTF_16_LE: { if (*src_bytes_left < 2) { errno = EINVAL; return false; } bool swap = (src_encoding == UTF_16_BE); wc = In16(*src_buf, swap); // 0xd800-0xdbff: high surrogates // 0xdc00-0xdfff: low surrogates if (wc >= 0xd800 && wc <= 0xdfff) { if (wc >= 0xdc00) { // Low surrogate before high surrogate. errno = EILSEQ; return false; } if (*src_bytes_left < 4) { errno = EINVAL; return false; } uint16_t hi = wc; uint16_t lo = In16(*src_buf + 2, swap); wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); src_bytes_used = 4; } break; } case UTF_32_BE: case UTF_32_LE: case WCHAR_T: if (*src_bytes_left < 4) { errno = EINVAL; return false; } wc = In32(*src_buf, (src_encoding == UTF_32_BE)); break; } if (errno == EILSEQ) { switch (mode) { case ERROR: return false; case IGNORE: *src_buf += src_bytes_used; *src_bytes_left -= src_bytes_used; ignored = true; return GetNext(); case TRANSLIT: wc = '?'; ++replacement_count; return true; } } return true; } bool Convert() { errno = 0; switch (dst_encoding) { case US_ASCII: buf[0] = wc; dst_bytes_used = 1; if (wc > 0x7f) errno = EILSEQ; break; case UTF_8: dst_bytes_used = c32rtomb(buf, wc, &ps); if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { break; // EILSEQ already set. } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { errno = EINVAL; return false; } break; case UTF_16_BE: case UTF_16_LE: { bool swap = (dst_encoding == UTF_16_BE); if (wc < 0x10000) { // BMP. Out16(buf, wc, swap); } else { // Supplementary plane; output surrogate pair. wc -= 0x10000; char16_t hi = 0xd800 | (wc >> 10); char16_t lo = 0xdc00 | (wc & 0x3ff); Out16(buf + 0, hi, swap); Out16(buf + 2, lo, swap); dst_bytes_used = 4; } } break; case UTF_32_BE: case UTF_32_LE: case WCHAR_T: Out32(wc, (dst_encoding == UTF_32_BE)); break; } if (errno == EILSEQ) { if (mode == IGNORE) { *src_buf += src_bytes_used; *src_bytes_left -= src_bytes_used; ignored = true; return true; } else if (mode == TRANSLIT) { wc = '?'; ++replacement_count; return Convert(); } return false; } return Emit(); } uint16_t In16(const char* buf, bool swap) { const uint8_t* src = reinterpret_cast(buf); uint16_t wc = (src[0]) | (src[1] << 8); if (swap) wc = __swap16(wc); src_bytes_used = 2; return wc; } uint32_t In32(const char* buf, bool swap) { const uint8_t* src = reinterpret_cast(buf); uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); if (swap) wc = __swap32(wc); src_bytes_used = 4; return wc; } void Out16(char* dst, char16_t ch, bool swap) { if (swap) ch = __swap16(ch); dst[0] = ch; dst[1] = ch >> 8; dst_bytes_used = 2; } void Out32(char32_t ch, bool swap) { if (swap) ch = __swap32(ch); buf[0] = ch; buf[1] = ch >> 8; buf[2] = ch >> 16; buf[3] = ch >> 24; dst_bytes_used = 4; } bool Emit() { if (dst_bytes_used > *dst_bytes_left) { errno = E2BIG; return false; } memcpy(*dst_buf, buf, dst_bytes_used); *src_buf += src_bytes_used; *src_bytes_left -= src_bytes_used; *dst_buf += dst_bytes_used; *dst_bytes_left -= dst_bytes_used; return true; } int Done() { if (mode == TRANSLIT) return replacement_count; if (ignored) { errno = EILSEQ; return -1; } return 0; } }; iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { iconv_t result = new __iconv_t; if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { delete result; errno = EINVAL; return INVALID_ICONV_T; } return result; } size_t iconv(iconv_t __converter, char** __src_buf, size_t* __src_bytes_left, char** __dst_buf, size_t* __dst_bytes_left) { if (__converter == INVALID_ICONV_T) { errno = EBADF; return -1; } return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); } int iconv_close(iconv_t __converter) { if (__converter == INVALID_ICONV_T) { errno = EBADF; return -1; } delete __converter; return 0; }