iconv.c 5.5 KB
Newer Older
R
Rich Felker 已提交
1 2 3 4 5 6 7 8
#include <iconv.h>
#include <errno.h>
#include <wchar.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <stdint.h>

9 10 11 12 13 14 15 16 17
/* Special charmap type codes. Each value is stored as the byte following a
 * charmap's alias list; find_charmap() treats any value above 0200 as a
 * special type rather than a count of elided table entries.
 * The low bits double as the byte-order selector handed to get_16/put_16
 * (which mask with 1) and get_32/put_32 (which mask with 3): a masked value
 * of 0 reads/writes most-significant byte first. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define US_ASCII    0306
#define WCHAR_T     0307
#define UTF_8       0310
R
Rich Felker 已提交
18 19 20 21 22 23 24 25 26

/* FIXME: these are not implemented yet
 * EUC:   A1-FE A1-FE
 * GBK:   81-FE 40-7E,80-FE
 * Big5:  A1-FE 40-7E,A1-FE
 */

/* Definitions of charmaps. Each charmap consists of:
 * 1. Empty-string-terminated list of null-terminated aliases.
 * 2. Special type code or number of elided entries.
 * 3. Character table (size determined by field 2). */
R
Rich Felker 已提交
29 30

/* Packed table of all known charsets. Each record is a run of NUL-terminated
 * alias strings, an empty string, and then one byte holding either a special
 * type code (an octal value above 0200, matching the UTF_ / UCS2 / US_ASCII /
 * WCHAR_T defines) or the number of elided entries of a legacy 8-bit table.
 * The built-in encodings below carry no table; the legacy code pages and
 * their packed tables come from codepages.h. Aliases are stored lowercase
 * with punctuation removed, which is the form fuzzycmp() compares against. */
static const unsigned char charmaps[] =
"utf8\0\0\310"
"wchart\0\0\307"
"ucs2\0ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16\0utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\306"
#include "codepages.h"
;

43 44 45
/* Code point values referenced by the legacy 8-bit charmap tables: iconv()
 * unpacks a 10-bit index from a charmap table and looks the result up here.
 * A stored value of 0 makes the decoder keep the input byte unchanged and a
 * stored value of 1 makes it report an illegal sequence. */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};
R
Rich Felker 已提交
46

47
/* Loosely compare a caller-supplied charset name with a canonical alias.
 *
 * a: name to test; may contain upper-case letters and punctuation.
 * b: canonical alias (lowercase letters and digits only).
 *
 * Characters of a that are not ASCII letters or digits are skipped and
 * letters are folded to lowercase before comparing, so "UTF-8" matches
 * "utf8". Returns 0 on a match and nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Skip bytes that are neither a-z/A-Z (26 letters) nor 0-9
		 * (10 digits). Both differences are evaluated as unsigned, so
		 * bytes below the range wrap to large values and are skipped
		 * as well. */
		while (*a && (*a|32U)-'a' >= 26 && *a-'0' >= 10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}

56
static size_t find_charmap(const void *name)
R
Rich Felker 已提交
57 58 59 60
{
	const unsigned char *s;
	for (s=charmaps; *s; ) {
		if (!fuzzycmp(name, s)) {
61
			for (; *s; s+=strlen((void *)s)+1);
R
Rich Felker 已提交
62 63
			return s+1-charmaps;
		}
64
		s += strlen((void *)s)+1;
65 66 67 68
		if (!*s) {
			if (s[1] > 0200) s+=2;
			else s+=2+(128U-s[1])/4*5;
		}
R
Rich Felker 已提交
69 70 71 72 73 74
	}
	return -1;
}

/* Create a conversion descriptor for converting from charset `from` to
 * charset `to`.
 *
 * Both names are resolved with find_charmap(). If either is unknown, errno
 * is set to EINVAL and (iconv_t)-1 is returned. Otherwise the descriptor is
 * not an allocated object: it is the two charmaps[] offsets packed into one
 * pointer-sized value, source offset in the bits above 16 and destination
 * offset in the low 16 bits. iconv() unpacks it the same way. */
iconv_t iconv_open(const char *to, const char *from)
{
	size_t f, t;

	/* Destination is looked up first; the source only if that succeeds. */
	t = find_charmap(to);
	if (t != (size_t)-1) f = find_charmap(from);
	if (t == (size_t)-1 || f == (size_t)-1) {
		errno = EINVAL;
		return (iconv_t)-1;
	}

	return (void *)(f<<16 | t);
}

/* Dispose of a conversion descriptor. The argument is not used and nothing
 * is released here; the function always reports success (0). */
int iconv_close(iconv_t cd)
{
	(void)cd;
	return 0;
}

/* Read a 16-bit value from the two bytes at s.
 * Only bit 0 of e is used: when it is clear the first byte is the most
 * significant one, when it is set the second byte is. */
static unsigned get_16(const unsigned char *s, int e)
{
	const int hi = e & 1;
	const int lo = 1 - hi;

	return s[hi]<<8 | s[lo];
}

/* Store the low 16 bits of c into the two bytes at s.
 * Only bit 0 of e is used: when it is clear the most significant byte is
 * written first, when it is set it is written second. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	const int hi = e & 1;
	const int lo = 1 - hi;

	s[hi] = c>>8;
	s[lo] = c;
}

/* Read a 32-bit value from the four bytes at s.
 * Only the low two bits of e are used. The byte at index e (after masking)
 * is the most significant one and the remaining indices are e^1, e^2, e^3,
 * so 0 reads most-significant-byte-first and 3 reads least-significant-
 * byte-first. */
static unsigned get_32(const unsigned char *s, int e)
{
	e &= 3;
	/* Convert each byte to unsigned before shifting so that no shift is
	 * performed on a signed int. */
	return (unsigned)s[e]<<24 | (unsigned)s[e^1]<<16
	     | (unsigned)s[e^2]<<8 | (unsigned)s[e^3];
}

/* Store the 32-bit value c into the four bytes at s.
 * Only the low two bits of e are used. The most significant byte goes to
 * index e (after masking) and the following ones to e^1, e^2, e^3, so 0
 * writes most-significant-byte-first and 3 writes least-significant-
 * byte-first. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	int i;

	e &= 3;
	for (i = 0; i < 4; i++)
		s[e^i] = c >> (24 - 8*i);
}

/* iconv() decodes and encodes UTF-8 through these two names, which are
 * aliased here to the standard mbrtowc() and wctomb().
 * NOTE(review): those functions follow the current locale's multibyte
 * encoding, so this presumably relies on that encoding being UTF-8 --
 * confirm for the target libc. Adapt as needed. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb

#include <stdio.h>
size_t iconv(iconv_t cd0, char **in, size_t *inb, char **out, size_t *outb)
{
	size_t x=0;
	unsigned long cd = (unsigned long)cd0;
	unsigned to = cd & 0xffff;
	unsigned from = cd >> 16;
129 130
	const unsigned char *map = charmaps+from+1;
	const unsigned char *tomap = charmaps+to+1;
R
Rich Felker 已提交
131 132 133 134 135
	mbstate_t st = {0};
	wchar_t wc;
	unsigned c, d;
	size_t k, l;
	int err;
136 137
	unsigned char type = map[-1];
	unsigned char totype = tomap[-1];
R
Rich Felker 已提交
138 139 140 141 142 143

	if (!in || !*in || !*inb) return 0;

	for (; *inb; *in+=l, *inb-=l) {
		c = *(unsigned char *)*in;
		l = 1;
144 145

		if (c >= 128) switch (type) {
R
Rich Felker 已提交
146 147 148 149 150 151 152 153
		case UTF_8:
			l = mbrtowc_utf8(&wc, *in, *inb, &st);
			if (!l) l++;
			else if (l == (size_t)-1) goto ilseq;
			else if (l == (size_t)-2) goto starved;
			c = wc;
			break;
		case US_ASCII:
154
			goto ilseq;
R
Rich Felker 已提交
155 156 157 158 159 160 161 162 163
		case WCHAR_T:
			l = sizeof(wchar_t);
			if (*inb < l) goto starved;
			c = *(wchar_t *)*in;
			if (0) {
		case UTF_32BE:
		case UTF_32LE:
			l = 4;
			if (*inb < 4) goto starved;
164
			c = get_32((void *)*in, type);
R
Rich Felker 已提交
165 166 167 168 169 170 171 172 173
			}
			if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
			break;
		case UCS2BE:
		case UCS2LE:
		case UTF_16BE:
		case UTF_16LE:
			l = 2;
			if (*inb < 2) goto starved;
174
			c = get_16((void *)*in, type);
R
Rich Felker 已提交
175 176 177 178 179
			if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
			if ((unsigned)(c-0xd800) < 0x400) {
				if (type-UCS2BE < 2U) goto ilseq;
				l = 4;
				if (*inb < 4) goto starved;
180
				d = get_16((void *)(*in + 2), from);
R
Rich Felker 已提交
181 182 183 184
				if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
				c = ((c-0xd800)<<10) | (d-0xdc00);
			}
			break;
185 186 187 188 189 190 191
		default:
			if (c < 128+type) break;
			c -= 128+type;
			c = legacy_chars[ map[c*5/4]>>2*c%8 |
				map[c*5/4+1]<<8-2*c%8 & 1023 ];
			if (!c) c = *(unsigned char *)*in;
			if (c==1) goto ilseq;
R
Rich Felker 已提交
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
		}

		switch (totype) {
		case WCHAR_T:
			if (*outb < sizeof(wchar_t)) goto toobig;
			*(wchar_t *)*out = c;
			*out += sizeof(wchar_t);
			*outb -= sizeof(wchar_t);
			break;
		case UTF_8:
			if (*outb < 4) {
				char tmp[4];
				k = wctomb_utf8(tmp, c);
				if (*outb < k) goto toobig;
				memcpy(*out, tmp, k);
			} else k = wctomb_utf8(*out, c);
			*out += k;
			*outb -= k;
			break;
211 212 213
		case US_ASCII:
			if (c > 0x7f) subst: x++, c='*';
		default:
R
Rich Felker 已提交
214
			if (*outb < 1) goto toobig;
215
			if (c < 128+totype) {
R
Rich Felker 已提交
216 217 218 219 220
			revout:
				*(*out)++ = c;
				*outb -= 1;
				break;
			}
221 222 223 224 225
			d = c;
			for (c=0; c<128-totype; c++) {
				if (d == legacy_chars[ map[c*5/4]>>2*c%8 |
					map[c*5/4+1]<<8-2*c%8 & 1023 ]) {
					c += 128;
R
Rich Felker 已提交
226 227 228
					goto revout;
				}
			}
229
			goto subst;
R
Rich Felker 已提交
230 231 232 233 234 235
		case UCS2BE:
		case UCS2LE:
		case UTF_16BE:
		case UTF_16LE:
			if (c < 0x10000) {
				if (*outb < 2) goto toobig;
236
				put_16((void *)*out, c, totype);
R
Rich Felker 已提交
237 238 239 240 241 242
				*out += 2;
				*outb -= 2;
				break;
			}
			if (type-UCS2BE < 2U) goto ilseq;
			if (*outb < 4) goto toobig;
243 244
			put_16((void *)*out, (c>>10)|0xd800, totype);
			put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
R
Rich Felker 已提交
245 246 247 248 249 250
			*out += 4;
			*outb -= 4;
			break;
		case UTF_32BE:
		case UTF_32LE:
			if (*outb < 4) goto toobig;
251
			put_32((void *)*out, c, totype);
R
Rich Felker 已提交
252 253 254 255 256 257 258 259 260 261 262 263
			*out += 4;
			*outb -= 4;
			break;
		}
	}
	return x;
ilseq:
	err = EILSEQ;
	x = -1;
	goto end;
toobig:
	err = E2BIG;
264
	x = -1;
R
Rich Felker 已提交
265 266 267
	goto end;
starved:
	err = EINVAL;
268
	x = -1;
R
Rich Felker 已提交
269 270 271 272
end:
	errno = err;
	return x;
}