/*
 * conversion functions between pg_wchar and multi-byte streams.
 * Tatsuo Ishii
 * $Id: wchar.c,v 1.16 2001/03/08 00:24:34 tgl Exp $
 *
 * WIN1250 client encoding updated by Pavel Behal
 *
 */
9 10
/* can be used in either frontend or backend */
#include "postgres_fe.h"
M
 
Marc G. Fournier 已提交
11 12

#include "mb/pg_wchar.h"
13

M
Marc G. Fournier 已提交
14
/*
 * Conversion to pg_wchar is table driven.
 * To add support for an encoding, define mb2wchar_with_len() and mblen()
 * for that particular encoding. Note that if the encoding is only
 * supported in the client, you don't need to define the
 * mb2wchar_with_len() function (SJIS is such a case).
 */
B
Bruce Momjian 已提交
21 22 23 24

/*
 * SQL/ASCII
 */
25
static int pg_ascii2wchar_with_len
26
			(const unsigned char *from, pg_wchar * to, int len)
B
Bruce Momjian 已提交
27
{
28 29
	int cnt = 0;

30
	while (len > 0 && *from)
31 32 33
	{
		*to++ = *from++;
		len--;
34
		cnt++;
35 36
	}
	*to = 0;
37
	return(cnt);
B
Bruce Momjian 已提交
38 39
}

40 41
/*
 * Byte length of the SQL/ASCII character at s: always 1.
 * The pointed-to byte is not inspected.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}

/*
 * EUC
 */

50
static int pg_euc2wchar_with_len
51
			(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
52
{
53 54
	int cnt = 0;

55
	while (len > 0 && *from)
56
	{
57
		if (*from == SS2 && len >= 2)
58 59 60
		{
			from++;
			*to = 0xff & *from++;
61
			len -= 2;
62
		}
63
		else if (*from == SS3 && len >= 3)
64 65 66 67 68 69
		{
			from++;
			*to = *from++ << 8;
			*to |= 0x3f & *from++;
			len -= 3;
		}
70
		else if ((*from & 0x80) && len >= 2)
71 72 73 74 75 76 77 78 79 80 81
		{
			*to = *from++ << 8;
			*to |= *from++;
			len -= 2;
		}
		else
		{
			*to = *from++;
			len--;
		}
		to++;
82
		cnt++;
83 84
	}
	*to = 0;
85
	return(cnt);
M
Marc G. Fournier 已提交
86 87
}

88 89
static int
pg_euc_mblen(const unsigned char *s)
M
Marc G. Fournier 已提交
90
{
91
	int			len;
92

93 94 95 96 97 98 99 100 101
	if (*s == SS2)
		len = 2;
	else if (*s == SS3)
		len = 3;
	else if (*s & 0x80)
		len = 2;
	else
		len = 1;
	return (len);
M
Marc G. Fournier 已提交
102 103 104
}

/*
 * EUC_JP
 */
107
static int pg_eucjp2wchar_with_len
108
			(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
109
{
110
	return(pg_euc2wchar_with_len(from, to, len));
M
Marc G. Fournier 已提交
111 112
}

113 114
/*
 * Byte length of the EUC_JP character at s; same rule as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}

/*
 * EUC_KR
 */
122
static int pg_euckr2wchar_with_len
123
			(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
124
{
125
	return(pg_euc2wchar_with_len(from, to, len));
M
Marc G. Fournier 已提交
126 127
}

128 129
/*
 * Byte length of the EUC_KR character at s; same rule as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}

134 135 136
/*
 * EUC_CN
 */
137
static int pg_euccn2wchar_with_len
138
			(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
139
{
140 141
	int cnt = 0;

142
	while (len > 0 && *from)
143
	{
144
		if (*from == SS2 && len >= 3)
145 146 147 148
		{
			from++;
			*to = 0x3f00 & (*from++ << 8);
			*to = *from++;
149
			len -= 3;
150
		}
151
		else if (*from == SS3 && len >= 3)
152 153 154 155 156 157
		{
			from++;
			*to = *from++ << 8;
			*to |= 0x3f & *from++;
			len -= 3;
		}
158
		else if ((*from & 0x80) && len >= 2)
159 160 161 162 163 164 165 166 167 168 169
		{
			*to = *from++ << 8;
			*to |= *from++;
			len -= 2;
		}
		else
		{
			*to = *from++;
			len--;
		}
		to++;
170
		cnt++;
171 172
	}
	*to = 0;
173
	return(cnt);
M
Marc G. Fournier 已提交
174 175
}

176 177
/*
 * Byte length of the EUC_CN character starting at s: 2 when the lead byte
 * has its high bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	int			len;

	if (*s & 0x80)
		len = 2;
	else
		len = 1;
	return len;
}

/*
 * EUC_TW
 */
191
static int pg_euctw2wchar_with_len
192
			(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
193
{
194 195
	int cnt = 0;

196
	while (len > 0 && *from)
197
	{
198
		if (*from == SS2 && len >= 4)
199 200 201 202 203
		{
			from++;
			*to = *from++ << 16;
			*to |= *from++ << 8;
			*to |= *from++;
204
			len -= 4;
205
		}
206
		else if (*from == SS3 && len >= 3)
207 208 209 210 211 212
		{
			from++;
			*to = *from++ << 8;
			*to |= 0x3f & *from++;
			len -= 3;
		}
213
		else if ((*from & 0x80) && len >= 2)
214 215 216 217 218 219 220 221 222 223 224
		{
			*to = *from++ << 8;
			*to |= *from++;
			len -= 2;
		}
		else
		{
			*to = *from++;
			len--;
		}
		to++;
225
		cnt++;
226 227
	}
	*to = 0;
228
	return(cnt);
M
Marc G. Fournier 已提交
229 230
}

231 232
static int
pg_euctw_mblen(const unsigned char *s)
233
{
234
	int			len;
235

236 237 238 239 240 241 242 243 244
	if (*s == SS2)
		len = 4;
	else if (*s == SS3)
		len = 3;
	else if (*s & 0x80)
		len = 2;
	else
		len = 1;
	return (len);
245 246
}

M
Marc G. Fournier 已提交
247
/*
248
 * convert UTF-8 string to pg_wchar (UCS-2)
M
Marc G. Fournier 已提交
249 250 251 252
 * caller should allocate enough space for "to"
 * len: length of from.
 * "from" not necessarily null terminated.
 */
253
static int
254
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
255
{
256 257 258
	unsigned char c1,
				c2,
				c3;
259
	int cnt = 0;
260

261
	while (len > 0 && *from)
262 263 264 265 266 267
	{
		if ((*from & 0x80) == 0)
		{
			*to = *from++;
			len--;
		}
268
		else if ((*from & 0xe0) == 0xc0 && len >= 2)
269 270 271 272 273
		{
			c1 = *from++ & 0x1f;
			c2 = *from++ & 0x3f;
			*to = c1 << 6;
			*to |= c2;
274
			len -= 2;
275
		}
276
		else if ((*from & 0xe0) == 0xe0 && len >= 3)
277 278 279 280 281 282 283
		{
			c1 = *from++ & 0x0f;
			c2 = *from++ & 0x3f;
			c3 = *from++ & 0x3f;
			*to = c1 << 12;
			*to |= c2 << 6;
			*to |= c3;
284
			len -= 3;
285
		}
286 287 288 289 290
		else
		{
			*to = *from++;
			len--;
		}
291
		to++;
292
		cnt++;
293 294
	}
	*to = 0;
295
	return(cnt);
M
Marc G. Fournier 已提交
296 297
}

298 299 300 301
/*
 * returns the byte length of a UTF-8 word pointed to by s
 *
 * Decided from the lead byte alone: 1 when the high bit is clear, 2 when
 * the top three bits are 110, 3 when the top three bits are 111 (which
 * covers every lead byte >= 0xe0).  Any other byte, i.e. one of the form
 * 10xxxxxx, yields 1.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	int			len = 1;

	if ((*s & 0x80) == 0)
		len = 1;
	else if ((*s & 0xe0) == 0xc0)
		len = 2;
	else if ((*s & 0xe0) == 0xe0)
		len = 3;
	return len;
}

M
Marc G. Fournier 已提交
315 316 317 318 319 320
/*
 * convert mule internal code to pg_wchar
 * caller should allocate enough space for "to"
 * len: length of from.
 * "from" not necessarily null terminated.
 */
321
static int
322
pg_mule2wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
323
{
324 325
	int cnt = 0;

326
	while (len > 0 && *from)
327
	{
328
		if (IS_LC1(*from) && len >= 2)
329 330 331 332 333
		{
			*to = *from++ << 16;
			*to |= *from++;
			len -= 2;
		}
334
		else if (IS_LCPRV1(*from) && len >= 3)
335 336 337 338 339 340
		{
			from++;
			*to = *from++ << 16;
			*to |= *from++;
			len -= 3;
		}
341
		else if (IS_LC2(*from) && len >= 3)
342 343 344 345 346 347
		{
			*to = *from++ << 16;
			*to |= *from++ << 8;
			*to |= *from++;
			len -= 3;
		}
348
		else if (IS_LCPRV2(*from) && len >= 4)
349 350 351 352 353 354 355 356 357 358 359 360 361
		{
			from++;
			*to = *from++ << 16;
			*to |= *from++ << 8;
			*to |= *from++;
			len -= 4;
		}
		else
		{						/* assume ASCII */
			*to = (unsigned char) *from++;
			len--;
		}
		to++;
362
		cnt++;
363 364
	}
	*to = 0;
365
	return(cnt);
M
Marc G. Fournier 已提交
366 367
}

368 369
/*
 * Byte length of the mule-internal-code character starting at s, decided
 * from the lead byte alone: 2 for IS_LC1, 3 for IS_LCPRV1 or IS_LC2,
 * 4 for IS_LCPRV2, otherwise 1 (assumed ASCII).
 */
int
pg_mule_mblen(const unsigned char *s)
{
	int			len;

	if (IS_LC1(*s))
		len = 2;
	else if (IS_LCPRV1(*s))
		len = 3;
	else if (IS_LC2(*s))
		len = 3;
	else if (IS_LCPRV2(*s))
		len = 4;
	else
	{							/* assume ASCII */
		len = 1;
	}
	return len;
}

388 389 390
/*
 * ISO8859-1
 */
391
static int
392
pg_latin12wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
M
Marc G. Fournier 已提交
393
{
394 395
	int cnt = 0;

396
	while (len > 0 && *from)
397
	{
398
		*to++ = *from++;
399
		len--;
400 401
		cnt++;
	}
402
	*to = 0;
403
	return(cnt);
M
Marc G. Fournier 已提交
404 405
}

406 407
/*
 * Byte length of the character at s in a single-byte encoding: always 1.
 * The pointed-to byte is not inspected.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}

412 413 414
/*
 * SJIS
 */
415 416
/*
 * Byte length of the SJIS character starting at s, decided from the lead
 * byte alone: 1 for 0xa1..0xdf (1-byte kana), 2 for any other byte above
 * 0x7f (kanji), otherwise 1 (ASCII).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
	{							/* 1 byte kana? */
		len = 1;
	}
	else if (*s > 0x7f)
	{							/* kanji? */
		len = 2;
	}
	else
	{							/* should be ASCII */
		len = 1;
	}
	return len;
}

435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
/*
 * Big5
 */
/*
 * Byte length of the Big5 character starting at s: a lead byte above 0x7f
 * starts a 2-byte character, anything else is a single (ASCII) byte.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead <= 0x7f)
		return 1;				/* should be ASCII */

	return 2;					/* double-byte character */
}

M
 
Marc G. Fournier 已提交
454
pg_wchar_tbl pg_wchar_table[] = {
T
Tatsuo Ishii 已提交
455 456 457 458 459 460 461
	{pg_ascii2wchar_with_len, pg_ascii_mblen},	/* 0 */
	{pg_eucjp2wchar_with_len, pg_eucjp_mblen},	/* 1 */
	{pg_euccn2wchar_with_len, pg_euccn_mblen},	/* 2 */
	{pg_euckr2wchar_with_len, pg_euckr_mblen},	/* 3 */
	{pg_euctw2wchar_with_len, pg_euctw_mblen},	/* 4 */
	{pg_utf2wchar_with_len, pg_utf_mblen},		/* 5 */
	{pg_mule2wchar_with_len, pg_mule_mblen},	/* 6 */
B
Bruce Momjian 已提交
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 7 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 8 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 9 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 10 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 11 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 12 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 13 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 14 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 15 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 16 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 17 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 18 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 19 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 20 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 21 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 22 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 23 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 24 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 25 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 26 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 27 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 28 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 29 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 30 */
	{pg_latin12wchar_with_len, pg_latin1_mblen},		/* 31 */
	{0, pg_sjis_mblen},			/* 32 */
488
	{0, pg_big5_mblen},			/* 33 */
489
	{pg_latin12wchar_with_len, pg_latin1_mblen} /* 34 */
490 491 492
};

/*
 * returns the byte length of a word for mule internal code
 *
 * Thin wrapper around pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	return pg_mule_mblen(mbstr);
}
498 499 500 501 502 503 504 505 506

/*
 * Returns the byte length of a multi-byte word.
 *
 * encoding: index into pg_wchar_table selecting the encoding's mblen
 *			 routine.
 * mbstr:	 pointer to the first byte of the character.
 *
 * An encoding id outside the table is treated as entry 0 instead of
 * indexing past the end of the array.
 */
int
pg_encoding_mblen(int encoding, const unsigned char *mbstr)
{
	const int	nencodings = (int) (sizeof(pg_wchar_table) / sizeof(pg_wchar_table[0]));

	if (encoding < 0 || encoding >= nencodings)
		encoding = 0;			/* fall back to the first table entry */

	return ((*pg_wchar_table[encoding].mblen) (mbstr));
}