platform_bionic/libc/bionic/iconv.cpp
Elliott Hughes a648733cb7 Implement <iconv.h>.
Bug: http://b/32978596
Test: ran tests
Change-Id: I56b6ae3d9c5a3a56d2b4afba33fb8f9e964bf7b9
2017-08-25 08:47:41 -07:00

368 lines
9.8 KiB
C++

/*
* Copyright (C) 2017 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <iconv.h>
#include <ctype.h>
#include <endian.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>
#include "private/bionic_mbstate.h"
// Sentinel iconv_t value: returned by iconv_open() on failure, and rejected with EBADF by
// iconv() and iconv_close().
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The set of encodings this implementation can convert from and to. The name-to-value
// mapping lives in __parse_encoding().
enum Encoding {
// 7-bit ASCII; any value above 0x7f is treated as an illegal sequence.
US_ASCII,
// UTF-8, decoded/encoded via mbrtoc32()/c32rtomb().
UTF_8,
// UTF-16 with explicit byte order; surrogate pairs are combined/split by the converter.
UTF_16_LE,
UTF_16_BE,
// UTF-32 with explicit byte order (4 bytes per code point).
UTF_32_LE,
UTF_32_BE,
// Handled by the converter exactly like UTF_32_LE (4 bytes, no byte swap).
WCHAR_T,
};
// What the converter does when it meets input it cannot decode or a character the
// destination encoding cannot represent. Selected by an optional "//IGNORE" or "//TRANSLIT"
// suffix on the destination encoding name; ERROR is the default.
enum Mode {
// Stop converting and fail with EILSEQ.
ERROR,
// Skip the offending input and keep going; the call still ends with -1/EILSEQ.
IGNORE,
// Substitute '?' and count the substitution; the count is the call's return value.
TRANSLIT,
};
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//
// Returns true if the caller-supplied encoding name `lhs` matches the canonical name `rhs`.
// `rhs` must already be in normalized form (lowercase letters and digits only, no leading
// zeros), e.g. "utf8" or "utf16le". `lhs` is matched loosely:
//  * non-alphanumeric characters are ignored ("UTF-8", "UTF_8", "UTF 8" all match "utf8"),
//  * case is ignored,
//  * a '0' is ignored only when it is a leading zero of a number, i.e. it is not preceded
//    by a digit and is followed by one ("UTF-08" matches "utf8", "UTF-106LE" does not
//    match "utf16le"),
//  * a GNU-style "//..." suffix terminates the name.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // True while the last character of lhs that we kept (did not skip) was a digit.
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Advance lhs to the next character that takes part in the comparison.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions are only defined for unsigned char values (and EOF).
      unsigned char c = *lhs;
      if (!isalnum(c)) {
        // Punctuation separates numbers, so a following '0' counts as leading again.
        after_digit = false;
        continue;
      }
      if (c == '0' && !after_digit) {
        unsigned char next = *(lhs + 1);
        if (isdigit(next)) continue;  // Leading zero: drop it.
      }
      break;
    }
    // Case doesn't matter either.
    unsigned char l = *lhs;
    unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
  return false;
}
static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
const char* suffix = strstr(s, "//");
if (suffix) {
if (!mode) return false;
if (strcmp(suffix, "//IGNORE") == 0) {
*mode = IGNORE;
} else if (strcmp(suffix, "//TRANSLIT") == 0) {
*mode = TRANSLIT;
} else {
return false;
}
}
if (__match_encoding(s, "utf8")) {
*encoding = UTF_8;
} else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
*encoding = US_ASCII;
} else if (__match_encoding(s, "utf16le")) {
*encoding = UTF_16_LE;
} else if (__match_encoding(s, "utf16be")) {
*encoding = UTF_16_BE;
} else if (__match_encoding(s, "utf32le")) {
*encoding = UTF_32_LE;
} else if (__match_encoding(s, "utf32be")) {
*encoding = UTF_32_BE;
} else if (__match_encoding(s, "wchart")) {
*encoding = WCHAR_T;
} else {
return false;
}
return true;
}
// The state behind an iconv_t: the source/destination encodings and error-handling mode
// chosen by iconv_open(), plus the per-call scratch state used by Convert().
//
// Conversion works one character at a time: GetNext() decodes one code point from the
// source buffer into `wc`, Convert() encodes `wc` into the scratch buffer `buf`, and Emit()
// copies it to the destination and advances the caller's pointers/counts.
struct __iconv_t {
  Encoding src_encoding;
  Encoding dst_encoding;
  Mode mode;

  __iconv_t() : mode(ERROR) {
  }

  // Converts as much of the source buffer as possible.
  //
  // The four arguments are the caller's in/out pointers and remaining-byte counts; they are
  // updated as characters are consumed and produced.
  //
  // Returns:
  //  * -1 with errno set (EILSEQ, EINVAL for a truncated source character, E2BIG when the
  //    destination is full) on failure; the source pointer is left at the offending input,
  //  * the number of '?' substitutions in TRANSLIT mode,
  //  * -1/EILSEQ in IGNORE mode if anything was skipped,
  //  * 0 otherwise.
  int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
    // Reset state.
    wc = 0;
    memset(&ps, 0, sizeof(ps));
    replacement_count = 0;
    ignored = false;

    src_buf = src_buf0;
    src_bytes_left = src_bytes_left0;
    dst_buf = dst_buf0;
    dst_bytes_left = dst_bytes_left0;

    while (*src_bytes_left > 0) {
      bool decoded = false;
      if (!GetNext(&decoded)) return -1;
      // In IGNORE mode GetNext() may have skipped bad input without producing a character.
      // Go round the loop again so the remaining length is re-checked before the next read.
      if (!decoded) continue;
      if (!Convert()) return -1;
    }
    return Done();
  }

 private:
  char32_t wc;              // The code point currently being converted.
  char buf[16];             // Encoded form of `wc`, waiting to be emitted.
  size_t src_bytes_used;    // Source bytes occupied by the current character.
  size_t dst_bytes_used;    // Valid bytes in `buf`.
  mbstate_t ps;             // State for mbrtoc32()/c32rtomb().
  size_t replacement_count; // Number of '?' substitutions (TRANSLIT).
  bool ignored;             // Whether anything was skipped (IGNORE).
  char** src_buf;
  size_t* src_bytes_left;
  char** dst_buf;
  size_t* dst_bytes_left;

  // Decodes the next source character into `wc` and records its length in `src_bytes_used`.
  // Does not advance the source pointer unless input is skipped in IGNORE mode.
  //
  // Returns false (with errno set) on error. On success `*decoded` says whether `wc` holds
  // a character to convert; it is false only when IGNORE mode skipped invalid input.
  bool GetNext(bool* decoded) {
    *decoded = false;
    errno = 0;
    switch (src_encoding) {
      case US_ASCII:
        wc = static_cast<uint8_t>(**src_buf);
        src_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
        if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          // The error value is not a length. Treat the offending byte as the bad sequence
          // so that IGNORE and TRANSLIT move forwards by one byte and resynchronize.
          src_bytes_used = 1;
          memset(&ps, 0, sizeof(ps));
          errno = EILSEQ;
        } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        } else if (src_bytes_used == 0) {
          // mbrtoc32() reports a decoded NUL as 0, but it still occupies one source byte.
          src_bytes_used = 1;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        if (*src_bytes_left < 2) {
          errno = EINVAL;
          return false;
        }
        bool swap = (src_encoding == UTF_16_BE);
        wc = In16(*src_buf, swap);
        // 0xd800-0xdbff: high surrogates
        // 0xdc00-0xdfff: low surrogates
        if (wc >= 0xd800 && wc <= 0xdfff) {
          if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
            errno = EILSEQ;
            break;
          }
          if (*src_bytes_left < 4) {
            errno = EINVAL;
            return false;
          }
          uint16_t hi = wc;
          uint16_t lo = In16(*src_buf + 2, swap);
          if (lo < 0xdc00 || lo > 0xdfff) {
            // Unpaired high surrogate. Only the high surrogate itself is invalid, so leave
            // src_bytes_used at 2 (as set by In16()).
            errno = EILSEQ;
            break;
          }
          wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
          src_bytes_used = 4;
        }
        break;
      }

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        if (*src_bytes_left < 4) {
          errno = EINVAL;
          return false;
        }
        wc = In32(*src_buf, (src_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      switch (mode) {
        case ERROR:
          return false;
        case IGNORE:
          // Skip the bad input; the caller re-checks how much input is left.
          *src_buf += src_bytes_used;
          *src_bytes_left -= src_bytes_used;
          ignored = true;
          return true;
        case TRANSLIT:
          wc = '?';
          ++replacement_count;
          break;
      }
    }
    *decoded = true;
    return true;
  }

  // Encodes `wc` in the destination encoding and emits it.
  // Returns false (with errno set) if the character cannot be represented in ERROR mode or
  // the destination buffer is too small.
  bool Convert() {
    errno = 0;
    switch (dst_encoding) {
      case US_ASCII:
        buf[0] = wc;
        dst_bytes_used = 1;
        if (wc > 0x7f) errno = EILSEQ;
        break;

      case UTF_8:
        dst_bytes_used = c32rtomb(buf, wc, &ps);
        if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
          break;  // EILSEQ already set.
        } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
          errno = EINVAL;
          return false;
        }
        break;

      case UTF_16_BE:
      case UTF_16_LE: {
        bool swap = (dst_encoding == UTF_16_BE);
        if (wc < 0x10000) {  // BMP.
          Out16(buf, wc, swap);
        } else if (wc > 0x10ffff) {
          // Not expressible as a surrogate pair.
          errno = EILSEQ;
        } else {  // Supplementary plane; output surrogate pair.
          char32_t offset = wc - 0x10000;
          char16_t hi = 0xd800 | (offset >> 10);
          char16_t lo = 0xdc00 | (offset & 0x3ff);
          Out16(buf + 0, hi, swap);
          Out16(buf + 2, lo, swap);
          dst_bytes_used = 4;
        }
      } break;

      case UTF_32_BE:
      case UTF_32_LE:
      case WCHAR_T:
        Out32(wc, (dst_encoding == UTF_32_BE));
        break;
    }

    if (errno == EILSEQ) {
      if (mode == IGNORE) {
        // Drop the unrepresentable character: consume its input, produce no output.
        *src_buf += src_bytes_used;
        *src_bytes_left -= src_bytes_used;
        ignored = true;
        return true;
      } else if (mode == TRANSLIT) {
        wc = '?';
        ++replacement_count;
        return Convert();
      }
      return false;
    }

    return Emit();
  }

  // Reads a 16-bit little-endian value from `p`, byte-swapping it if `swap` is set.
  // Sets `src_bytes_used` to 2.
  uint16_t In16(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    uint16_t value = (src[0]) | (src[1] << 8);
    if (swap) value = __swap16(value);
    src_bytes_used = 2;
    return value;
  }

  // Reads a 32-bit little-endian value from `p`, byte-swapping it if `swap` is set.
  // Sets `src_bytes_used` to 4.
  uint32_t In32(const char* p, bool swap) {
    const uint8_t* src = reinterpret_cast<const uint8_t*>(p);
    // Widen before shifting: a promoted int shifted left by 24 could overflow.
    uint32_t value = static_cast<uint32_t>(src[0]) |
                     (static_cast<uint32_t>(src[1]) << 8) |
                     (static_cast<uint32_t>(src[2]) << 16) |
                     (static_cast<uint32_t>(src[3]) << 24);
    if (swap) value = __swap32(value);
    src_bytes_used = 4;
    return value;
  }

  // Writes `ch` to `dst` as 2 little-endian bytes (byte-swapped first if `swap` is set).
  // Sets `dst_bytes_used` to 2.
  void Out16(char* dst, char16_t ch, bool swap) {
    if (swap) ch = __swap16(ch);
    dst[0] = ch;
    dst[1] = ch >> 8;
    dst_bytes_used = 2;
  }

  // Writes `ch` to the start of `buf` as 4 little-endian bytes (byte-swapped first if
  // `swap` is set). Sets `dst_bytes_used` to 4.
  void Out32(char32_t ch, bool swap) {
    if (swap) ch = __swap32(ch);
    buf[0] = ch;
    buf[1] = ch >> 8;
    buf[2] = ch >> 16;
    buf[3] = ch >> 24;
    dst_bytes_used = 4;
  }

  // Copies the encoded character in `buf` to the destination and advances both the source
  // and the destination. Fails with E2BIG, changing nothing, if it doesn't fit.
  bool Emit() {
    if (dst_bytes_used > *dst_bytes_left) {
      errno = E2BIG;
      return false;
    }

    memcpy(*dst_buf, buf, dst_bytes_used);

    *src_buf += src_bytes_used;
    *src_bytes_left -= src_bytes_used;
    *dst_buf += dst_bytes_used;
    *dst_bytes_left -= dst_bytes_used;
    return true;
  }

  // Computes the result for a call that consumed all of its input.
  int Done() {
    if (mode == TRANSLIT) return replacement_count;
    if (ignored) {
      errno = EILSEQ;
      return -1;
    }
    return 0;
  }
};
// Creates a converter from `__src_encoding` to `__dst_encoding`.
//
// Only the destination name may carry a "//IGNORE" or "//TRANSLIT" suffix. Returns the new
// converter, or INVALID_ICONV_T with errno set to EINVAL if either name is not recognized.
iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
  iconv_t cd = new __iconv_t;

  // The source name is parsed first; the destination is only looked at if that succeeds.
  bool parsed = __parse_encoding(__src_encoding, &cd->src_encoding, nullptr) &&
                __parse_encoding(__dst_encoding, &cd->dst_encoding, &cd->mode);
  if (parsed) return cd;

  delete cd;
  errno = EINVAL;
  return INVALID_ICONV_T;
}
// Converts text from `*__src_buf` into `*__dst_buf` using `__converter`, updating all four
// pointers/counts to reflect what was consumed and produced.
//
// Returns (size_t)-1 with errno set on failure (EBADF for an invalid converter; otherwise
// whatever the converter reports), or the converter's result on success. Calling with a
// null `__src_buf` (or a null `*__src_buf`) is the POSIX way to request a state reset and
// returns 0 without touching any buffer.
size_t iconv(iconv_t __converter,
             char** __src_buf, size_t* __src_bytes_left,
             char** __dst_buf, size_t* __dst_bytes_left) {
  if (__converter == INVALID_ICONV_T) {
    errno = EBADF;
    return -1;
  }

  // A reset request: the converter starts every call from a fresh state anyway, so there
  // is nothing to flush. Handling it here also avoids dereferencing null arguments.
  if (__src_buf == nullptr || *__src_buf == nullptr) return 0;

  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
}
// Destroys a converter created by iconv_open().
// Returns 0 on success, or -1 with errno set to EBADF if given INVALID_ICONV_T.
int iconv_close(iconv_t __converter) {
  if (__converter != INVALID_ICONV_T) {
    delete __converter;
    return 0;
  }
  errno = EBADF;
  return -1;
}