From 9defd37c11749b535b84a2230ec856deea10972e Mon Sep 17 00:00:00 2001 From: Jo-Philipp Wich Date: Fri, 3 Feb 2012 01:28:14 +0000 Subject: [PATCH] libiconv: implement UTF-8 encode/decode Implement own UTF-8 encoding and decoding routines as uClibc's wchar procedures are mostly stubbed. Add Latin9 (ISO-8859-15) decoding support while we're at it. SVN-Revision: 30005 --- libs/libiconv/Makefile | 4 +- libs/libiconv/src/iconv.c | 154 +++++++++++++++++++++++++++++++++++--- 2 files changed, 146 insertions(+), 12 deletions(-) diff --git a/libs/libiconv/Makefile b/libs/libiconv/Makefile index 6509ce17d..8a90ef1d6 100644 --- a/libs/libiconv/Makefile +++ b/libs/libiconv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2010-2011 OpenWrt.org +# Copyright (C) 2010-2012 OpenWrt.org # # This Makefile and the code shipped in src/ is free software, licensed # under the GNU Lesser General Public License, version 2.1 and later. @@ -11,7 +11,7 @@ include $(TOPDIR)/rules.mk PKG_NAME:=libiconv -PKG_RELEASE:=5 +PKG_RELEASE:=6 include $(INCLUDE_DIR)/package.mk include $(INCLUDE_DIR)/host-build.mk diff --git a/libs/libiconv/src/iconv.c b/libs/libiconv/src/iconv.c index 8b2bd97d9..8dd5a0cb3 100644 --- a/libs/libiconv/src/iconv.c +++ b/libs/libiconv/src/iconv.c @@ -99,14 +99,14 @@ iconv_t iconv_open(const char *to, const char *from) unsigned f, t; int m; - if ((t = find_charset(to)) >= 8) + if ((t = find_charset(to)) > 8) return -1; if ((f = find_charset(from)) < 255) - return 0 | (t<<1) | (f<<4); + return 0 | (t<<1) | (f<<8); if ((m = find_charmap(from)) > -1) - return 1 | (t<<1) | (m<<4); + return 1 | (t<<1) | (m<<8); return -1; } @@ -129,13 +129,143 @@ static inline void put_16(unsigned char *s, wchar_t c, int endian) s[endian^1] = c; } +static inline int utf8enc_wchar(char *outb, wchar_t c) +{ + if (c <= 0x7F) { + *outb = c; + return 1; + } + else if (c <= 0x7FF) { + *outb++ = ((c >> 6) & 0x1F) | 0xC0; + *outb++ = ( c & 0x3F) | 0x80; + return 2; + } + else if (c <= 0xFFFF) { + *outb++ = ((c >> 12) & 0x0F) | 0xE0; + *outb++ = ((c >> 6) & 0x3F) | 0x80; + *outb++ = ( c & 0x3F) | 0x80; + return 3; + } + else if (c <= 0x10FFFF) { + *outb++ = ((c >> 18) & 0x07) | 0xF0; + *outb++ = ((c >> 12) & 0x3F) | 0x80; + *outb++ = ((c >> 6) & 0x3F) | 0x80; + *outb++ = ( c & 0x3F) | 0x80; + return 4; + } + else { + *outb++ = '?'; + return 1; + } +} + +static inline int utf8seq_is_overlong(char *s, int n) +{ + switch (n) + { + case 2: + /* 1100000x (10xxxxxx) */ + return (((*s >> 1) == 0x60) && + ((*(s+1) >> 6) == 0x02)); + + case 3: + /* 11100000 100xxxxx (10xxxxxx) */ + return ((*s == 0xE0) && + ((*(s+1) >> 5) == 0x04) && + ((*(s+2) >> 6) == 0x02)); + + case 4: + /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */ + return ((*s == 0xF0) && + ((*(s+1) >> 4) == 0x08) && + ((*(s+2) >> 6) == 0x02) && + ((*(s+3) >> 6) == 0x02)); + } + + return 0; +} + +static inline int utf8seq_is_surrogate(char *s, int n) +{ + return ((n == 3) && (*s == 0xED) && (*(s+1) >= 0xA0) && (*(s+1) <= 0xBF)); +} + +static inline int utf8seq_is_illegal(char *s, int n) +{ + return ((n == 3) && (*s == 0xEF) && (*(s+1) == 0xBF) && + (*(s+2) >= 0xBE) && (*(s+2) <= 0xBF)); +} + +static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb) +{ + int i; + int n = -1; + + /* trivial char */ + if (*in <= 0x7F) { + *c = *in; + return 1; + } + + /* find utf8 sequence length */ + if ((*in & 0xE0) == 0xC0) n = 2; + else if ((*in & 0xF0) == 0xE0) n = 3; + else if ((*in & 0xF8) == 0xF0) n = 4; + else if ((*in & 0xFC) == 0xF8) n = 5; + else if ((*in & 0xFE) == 0xFC) n = 6; + + /* starved? */ + if (n > inb) + return -2; + + /* decode ... */ + if (n > 1 && n < 5) { + /* reject invalid sequences */ + if (utf8seq_is_overlong(in, n) || + utf8seq_is_surrogate(in, n) || + utf8seq_is_illegal(in, n)) + return -1; + + /* decode ... */ + *c = (char)(*in++ & (0x7F >> n)); + + for (i = 1; i < n; i++) { + /* illegal continuation byte */ + if (*in < 0x80 || *in > 0xBF) + return -1; + + *c = (*c << 6) | (*in++ & 0x3F); + } + + return n; + } + + /* unmapped sequence (> 4) */ + return -1; +} + +static inline char latin9_translit(wchar_t c) +{ + /* a number of trivial iso-8859-15 <> utf-8 transliterations */ + switch (c) { + case 0x20AC: return 0xA4; /* Euro */ + case 0x0160: return 0xA6; /* S caron */ + case 0x0161: return 0xA8; /* s caron */ + case 0x017D: return 0xB4; /* Z caron */ + case 0x017E: return 0xB8; /* z caron */ + case 0x0152: return 0xBC; /* OE */ + case 0x0153: return 0xBD; /* oe */ + case 0x0178: return 0xBE; /* Y diaeresis */ + default: return '?'; + } +} + size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) { size_t x=0; - unsigned char to = (cd>>1)&7; + unsigned char to = (cd>>1)&127; unsigned char from = 255; const unsigned char *map = 0; - mbstate_t st = {0}; char tmp[MB_LEN_MAX]; wchar_t c, d; size_t k, l; @@ -144,9 +274,9 @@ size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) if (!in || !*in || !*inb) return 0; if (cd & 1) - map = charmaps[cd>>4].map; + map = charmaps[cd>>8].map; else - from = cd>>4; + from = cd>>8; for (; *inb; *in+=l, *inb-=l) { c = *(unsigned char *)*in; @@ -159,7 +289,7 @@ size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) c = *(wchar_t *)*in; break; case UTF_8: - l = mbrtowc(&c, *in, *inb, &st); + l = utf8dec_wchar(&c, *in, *inb); if (!l) l++; else if (l == (size_t)-1) goto ilseq; else if (l == (size_t)-2) goto starved; @@ -249,16 +379,20 @@ charok: break; case UTF_8: if (*outb < 4) { - k = wctomb(tmp, c); + k = utf8enc_wchar(tmp, c); if (*outb < k) goto toobig; memcpy(*out, tmp, k); - } else k = wctomb(*out, c); + } else k = utf8enc_wchar(*out, c); *out += k; *outb -= k; break; case US_ASCII: if (c > 0x7f) c = 0xfffd; /* fall thru and count replacement in latin1 case */ + case LATIN_9: + if (c >= 0x100 && c != 0xfffd) + c = latin9_translit(c); + /* fall through */ case LATIN_1: if (!*outb) goto toobig; if (c < 0x100) **out = c; -- 2.30.2