/* iconv.c - character set conversion: iconv_open(), iconv(), iconv_close() */
#include <iconv.h>
#include <errno.h>
#include <wchar.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <stdint.h>

/* Special charmap type codes. Each value must stay identical to the octal
 * escape that terminates the matching alias list in charmaps[].
 *
 * The low bits of the UTF-16/UCS-2/UTF-32 codes are used directly as the
 * byte-order selector by get_16/put_16 (bit 0: 0 = big endian, 1 = little
 * endian) and get_32/put_32 (low two bits: 0 = big endian, 3 = little
 * endian), so these particular values must not be renumbered freely. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define US_ASCII    0306
#define WCHAR_T     0307
#define UTF_8       0310

/* Japanese multibyte encodings (decoded through jis0208[][]). */
#define EUC_JP      0320
#define SHIFT_JIS   0321

/* Chinese multibyte encodings (decoded through gb18030[][]). */
#define GB18030     0330
#define GBK         0331
#define GB2312      0332

/* FIXME: these are not implemented yet
 * EUC:   A1-FE A1-FE
 * GBK:   81-FE 40-7E,80-FE
 * Big5:  A1-FE 40-7E,A1-FE
 */

/* Definitions of charmaps. Each charmap consists of:
 * 1. Empty-string-terminated list of null-terminated aliases.
 * 2. Special type code or number of elided entries.
 * 3. Character table (size determined by field 2). */

/* Concatenated charmap records. Every record is a run of NUL-terminated
 * alias names (already lowercased and stripped of punctuation, as
 * fuzzycmp expects), an empty string, and then one byte that is either a
 * special type code (octal 0300 and up) or the number of elided entries
 * of the packed table that follows. The records for the legacy 8-bit
 * code pages come from codepages.h. */
static const unsigned char charmaps[] =
/* Unicode encodings, wchar_t and ASCII */
"utf8\0\0\310"
"wchart\0\0\307"
"ucs2\0ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16\0utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\306"
/* Japanese */
"eucjp\0\0\320"
"shiftjis\0sjis\0\0\321"
/* Chinese */
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
#include "codepages.h"
;

/* Code points referenced by the 10-bit entries of the packed legacy
 * code page tables stored in charmaps[]. iconv() treats a value of 0 as
 * "same as the input byte" and a value of 1 as "illegal byte". */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};

/* Two-byte Japanese table, indexed [row][cell] by the EUC-JP and
 * Shift_JIS decoders; a zero entry is rejected as an illegal sequence. */
static const unsigned short jis0208[84][94] = {
#include "jis0208.h"
};

/* Two-byte GB2312/GBK/GB18030 table, indexed [lead byte - 0x81][trail
 * byte index]. The GB18030 four-byte decoder also scans it to skip code
 * points that already have a two-byte encoding. */
static const unsigned short gb18030[126][190] = {
#include "gb18030.h"
};

/* Loosely compare a user-supplied charset name against a canonical alias.
 *
 * a: name as given by the caller; ASCII letters are compared without
 *    regard to case and any character that is not an ASCII letter or
 *    digit is skipped when it precedes a compared character.
 * b: canonical alias, expected to consist of lowercase letters and
 *    digits only.
 *
 * Returns 0 when the names match and nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Skip everything in a that is neither a letter (a-z after
		 * case folding with |32) nor a digit. The bounds are
		 * exclusive so that '[', '{' and ':' count as punctuation. */
		while (*a && (*a|32U)-'a'>=26 && *a-'0'>=10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}

static size_t find_charmap(const void *name)
R
Rich Felker 已提交
75 76 77 78
{
	const unsigned char *s;
	for (s=charmaps; *s; ) {
		if (!fuzzycmp(name, s)) {
79
			for (; *s; s+=strlen((void *)s)+1);
R
Rich Felker 已提交
80 81
			return s+1-charmaps;
		}
82
		s += strlen((void *)s)+1;
83 84 85 86
		if (!*s) {
			if (s[1] > 0200) s+=2;
			else s+=2+(128U-s[1])/4*5;
		}
R
Rich Felker 已提交
87 88 89 90 91 92
	}
	return -1;
}

iconv_t iconv_open(const char *to, const char *from)
{
93
	size_t f, t;
R
Rich Felker 已提交
94

95 96
	if ((t = find_charmap(to))==-1
	 || (f = find_charmap(from))==-1
97
	 || (charmaps[t] >= 0320)) {
R
Rich Felker 已提交
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
		errno = EINVAL;
		return (iconv_t)-1;
	}

	return (void *)(f<<16 | t);
}

/* Release a conversion descriptor. The descriptor only encodes table
 * offsets and owns no resources, so this always succeeds. */
int iconv_close(iconv_t cd)
{
	(void)cd;
	return 0;
}

/* Read a 16-bit value from s. Bit 0 of e selects the byte order:
 * 0 = big endian, 1 = little endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	int hi = e & 1;   /* index of the most significant byte */
	int lo = 1 - hi;  /* index of the least significant byte */

	return s[hi]<<8 | s[lo];
}

/* Store the low 16 bits of c at s. Bit 0 of e selects the byte order:
 * 0 = big endian, 1 = little endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	int hi = e & 1;   /* index that receives the most significant byte */
	int lo = 1 - hi;  /* index that receives the least significant byte */

	s[hi] = c>>8;
	s[lo] = c;
}

/* Read a 32-bit value from s. The low two bits of e select the byte
 * order: 0 = big endian, 3 = little endian (s[e^i] is the byte holding
 * bits 31-8*i down to 24-8*i). */
static unsigned get_32(const unsigned char *s, int e)
{
	e &= 3;
	/* Convert before shifting so that no shift is done on a signed int. */
	return (unsigned)s[e]<<24 | (unsigned)s[e^1]<<16
	     | (unsigned)s[e^2]<<8 | s[e^3];
}

/* Store the 32-bit value c at s. The low two bits of e select the byte
 * order: 0 = big endian, 3 = little endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	s[e^0] = c>>24;
	s[e^1] = c>>16;
	s[e^2] = c>>8;
	s[e^3] = c;
}

/* UTF-8 decode/encode primitives used by iconv(). They are aliased to the
 * C library's mbrtowc()/wctomb(), which convert according to the current
 * locale's multibyte encoding; adapt as needed where that is not UTF-8. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb

size_t iconv(iconv_t cd0, char **in, size_t *inb, char **out, size_t *outb)
{
	size_t x=0;
	unsigned long cd = (unsigned long)cd0;
	unsigned to = cd & 0xffff;
	unsigned from = cd >> 16;
148 149
	const unsigned char *map = charmaps+from+1;
	const unsigned char *tomap = charmaps+to+1;
R
Rich Felker 已提交
150 151 152 153 154
	mbstate_t st = {0};
	wchar_t wc;
	unsigned c, d;
	size_t k, l;
	int err;
155 156
	unsigned char type = map[-1];
	unsigned char totype = tomap[-1];
R
Rich Felker 已提交
157 158 159 160 161 162

	if (!in || !*in || !*inb) return 0;

	for (; *inb; *in+=l, *inb-=l) {
		c = *(unsigned char *)*in;
		l = 1;
163 164

		if (c >= 128) switch (type) {
R
Rich Felker 已提交
165 166 167 168 169 170 171 172
		case UTF_8:
			l = mbrtowc_utf8(&wc, *in, *inb, &st);
			if (!l) l++;
			else if (l == (size_t)-1) goto ilseq;
			else if (l == (size_t)-2) goto starved;
			c = wc;
			break;
		case US_ASCII:
173
			goto ilseq;
R
Rich Felker 已提交
174 175 176 177 178 179 180 181 182
		case WCHAR_T:
			l = sizeof(wchar_t);
			if (*inb < l) goto starved;
			c = *(wchar_t *)*in;
			if (0) {
		case UTF_32BE:
		case UTF_32LE:
			l = 4;
			if (*inb < 4) goto starved;
183
			c = get_32((void *)*in, type);
R
Rich Felker 已提交
184 185 186 187 188 189 190 191 192
			}
			if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
			break;
		case UCS2BE:
		case UCS2LE:
		case UTF_16BE:
		case UTF_16LE:
			l = 2;
			if (*inb < 2) goto starved;
193
			c = get_16((void *)*in, type);
R
Rich Felker 已提交
194 195 196 197 198
			if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
			if ((unsigned)(c-0xd800) < 0x400) {
				if (type-UCS2BE < 2U) goto ilseq;
				l = 4;
				if (*inb < 4) goto starved;
199
				d = get_16((void *)(*in + 2), from);
R
Rich Felker 已提交
200 201 202 203
				if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
				c = ((c-0xd800)<<10) | (d-0xdc00);
			}
			break;
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
		case SHIFT_JIS:
			if (c-0xa1 <= 0xdf-0xa1) {
				c += 0xff61-0xa1;
				break;
			}
			l = 2;
			if (*inb < 2) goto starved;
			d = *((unsigned char *)*in + 1);
			if (c-129 <= 159-129) c -= 129;
			else if (c-224 <= 239-224) c -= 193;
			else goto ilseq;
			c *= 2;
			if (d-64 <= 158-64) {
				if (d==127) goto ilseq;
				if (d>127) d--;
				d -= 64;
			} else if (d-159 <= 252-159) {
				c++;
				d -= 159;
			}
			c = jis0208[c][d];
			if (!c) goto ilseq;
			break;
		case EUC_JP:
			l = 2;
			if (*inb < 2) goto starved;
			d = *((unsigned char *)*in + 1);
			if (c==0x8e) {
				c = d;
				if (c-0xa1 > 0xdf-0xa1) goto ilseq;
				c += 0xff61 - 0xa1;
				break;
			}
			c -= 0xa1;
			d -= 0xa1;
			if (c >= 84 || d >= 94) goto ilseq;
			c = jis0208[c][d];
			if (!c) goto ilseq;
			break;
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
		case GB2312:
			if (c < 0xa1) goto ilseq;
		case GBK:
		case GB18030:
			c -= 0x81;
			if (c >= 126) goto ilseq;
			l = 2;
			if (*inb < 2) goto starved;
			d = *((unsigned char *)*in + 1);
			if (d < 0xa1 && type == GB2312) goto ilseq;
			if (d-0x40>=191 || d==127) {
				if (d-'0'>9 || type != GB18030)
					goto ilseq;
				l = 4;
				if (*inb < 4) goto starved;
				c = (10*c + d-'0') * 1260;
				d = *((unsigned char *)*in + 2);
				if (d-0x81>126) goto ilseq;
				c += 10*(d-0x81);
				d = *((unsigned char *)*in + 3);
				if (d-'0'>9) goto ilseq;
				c += d-'0';
				c += 128;
				for (d=0; d<=c; ) {
					k = 0;
					for (int i=0; i<126; i++)
						for (int j=0; j<190; j++)
							if (gb18030[i][j]-d <= c-d)
								k++;
					d = c+1;
					c += k;
				}
				break;
			}
			d -= 0x40;
			if (d>63) d--;
			c = gb18030[c][d];
			break;
281 282 283 284 285 286 287
		default:
			if (c < 128+type) break;
			c -= 128+type;
			c = legacy_chars[ map[c*5/4]>>2*c%8 |
				map[c*5/4+1]<<8-2*c%8 & 1023 ];
			if (!c) c = *(unsigned char *)*in;
			if (c==1) goto ilseq;
R
Rich Felker 已提交
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
		}

		switch (totype) {
		case WCHAR_T:
			if (*outb < sizeof(wchar_t)) goto toobig;
			*(wchar_t *)*out = c;
			*out += sizeof(wchar_t);
			*outb -= sizeof(wchar_t);
			break;
		case UTF_8:
			if (*outb < 4) {
				char tmp[4];
				k = wctomb_utf8(tmp, c);
				if (*outb < k) goto toobig;
				memcpy(*out, tmp, k);
			} else k = wctomb_utf8(*out, c);
			*out += k;
			*outb -= k;
			break;
307 308 309
		case US_ASCII:
			if (c > 0x7f) subst: x++, c='*';
		default:
R
Rich Felker 已提交
310
			if (*outb < 1) goto toobig;
311
			if (c < 128+totype) {
R
Rich Felker 已提交
312 313 314 315 316
			revout:
				*(*out)++ = c;
				*outb -= 1;
				break;
			}
317 318 319 320 321
			d = c;
			for (c=0; c<128-totype; c++) {
				if (d == legacy_chars[ map[c*5/4]>>2*c%8 |
					map[c*5/4+1]<<8-2*c%8 & 1023 ]) {
					c += 128;
R
Rich Felker 已提交
322 323 324
					goto revout;
				}
			}
325
			goto subst;
R
Rich Felker 已提交
326 327 328 329 330 331
		case UCS2BE:
		case UCS2LE:
		case UTF_16BE:
		case UTF_16LE:
			if (c < 0x10000) {
				if (*outb < 2) goto toobig;
332
				put_16((void *)*out, c, totype);
R
Rich Felker 已提交
333 334 335 336 337 338
				*out += 2;
				*outb -= 2;
				break;
			}
			if (type-UCS2BE < 2U) goto ilseq;
			if (*outb < 4) goto toobig;
339 340
			put_16((void *)*out, (c>>10)|0xd800, totype);
			put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
R
Rich Felker 已提交
341 342 343 344 345 346
			*out += 4;
			*outb -= 4;
			break;
		case UTF_32BE:
		case UTF_32LE:
			if (*outb < 4) goto toobig;
347
			put_32((void *)*out, c, totype);
R
Rich Felker 已提交
348 349 350 351 352 353 354 355 356 357 358 359
			*out += 4;
			*outb -= 4;
			break;
		}
	}
	return x;
ilseq:
	err = EILSEQ;
	x = -1;
	goto end;
toobig:
	err = E2BIG;
360
	x = -1;
R
Rich Felker 已提交
361 362 363
	goto end;
starved:
	err = EINVAL;
364
	x = -1;
R
Rich Felker 已提交
365 366 367 368
end:
	errno = err;
	return x;
}