unicode.c 10.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629 (which obsoletes RFC 2279).
 *		https://www.rfc-editor.org/rfc/rfc3629.txt
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>

#include "udf_sb.h"

/*
 * Forward declaration: udf_translate_to_linux() is defined further down
 * in this file but is called by udf_get_filename() before that point.
 */
static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);

31
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
L
Linus Torvalds 已提交
32
{
33
	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
L
Linus Torvalds 已提交
34
		return 0;
35

L
Linus Torvalds 已提交
36 37 38 39
	memset(dest, 0, sizeof(struct ustr));
	memcpy(dest->u_name, src, strlen);
	dest->u_cmpID = 0x08;
	dest->u_len = strlen;
40

L
Linus Torvalds 已提交
41 42 43 44 45 46
	return strlen;
}

/*
 * udf_build_ustr
 */
47
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
L
Linus Torvalds 已提交
48 49 50
{
	int usesize;

51
	if ((!dest) || (!ptr) || (!size))
L
Linus Torvalds 已提交
52 53 54
		return -1;

	memset(dest, 0, sizeof(struct ustr));
55 56 57 58
	usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
	dest->u_cmpID = ptr[0];
	dest->u_len = ptr[size - 1];
	memcpy(dest->u_name, ptr + 1, usesize - 1);
59

L
Linus Torvalds 已提交
60 61 62 63 64 65
	return 0;
}

/*
 * udf_build_ustr_exact
 */
66
static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
L
Linus Torvalds 已提交
67
{
68
	if ((!dest) || (!ptr) || (!exactsize))
L
Linus Torvalds 已提交
69 70 71
		return -1;

	memset(dest, 0, sizeof(struct ustr));
72 73 74
	dest->u_cmpID = ptr[0];
	dest->u_len = exactsize - 1;
	memcpy(dest->u_name, ptr + 1, exactsize - 1);
75

L
Linus Torvalds 已提交
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
	return 0;
}

/*
 * udf_ocu_to_utf8
 *
 * PURPOSE
 *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_filldir().
 *
 * PRE-CONDITIONS
 *	utf			Pointer to UTF-8 output buffer.
 *	ocu			Pointer to OSTA Compressed Unicode input buffer
 *				of size UDF_NAME_LEN bytes.
 * 				both of type "struct ustr *"
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
{
	uint8_t *ocu;
	uint32_t c;
	uint8_t cmp_id, ocu_len;
	int i;

	ocu = ocu_i->u_name;

	ocu_len = ocu_i->u_len;
	cmp_id = ocu_i->u_cmpID;
	utf_o->u_len = 0;

114
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
115 116 117 118 119 120
		memset(utf_o, 0, sizeof(struct ustr));
		utf_o->u_cmpID = 0;
		utf_o->u_len = 0;
		return 0;
	}

121 122 123
	if ((cmp_id != 8) && (cmp_id != 16)) {
		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
124 125 126
		return 0;
	}

127
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
128 129 130 131 132 133 134

		/* Expand OSTA compressed Unicode to Unicode */
		c = ocu[i++];
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		/* Compress Unicode to UTF-8 */
135 136 137
		if (c < 0x80U) {
			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
		} else if (c < 0x800U) {
M
Marcin Slusarz 已提交
138 139 140 141
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xc0 | (c >> 6));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
142
		} else {
M
Marcin Slusarz 已提交
143 144 145 146 147 148 149
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xe0 | (c >> 12));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 |
							  ((c >> 6) & 0x3f));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
L
Linus Torvalds 已提交
150 151
		}
	}
152
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179

	return utf_o->u_len;
}

/*
 *
 * udf_utf8_to_ocu
 *
 * PURPOSE
 *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_lookup().
 *
 * PRE-CONDITIONS
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of size UDF_NAME_LEN bytes.
 *	utf			Pointer to UTF-8 input buffer.
 *	utf_len			Length of UTF-8 input buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
180
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
L
Linus Torvalds 已提交
181 182 183 184 185 186 187 188
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

189
try_again:
L
Linus Torvalds 已提交
190 191 192
	u_len = 0U;
	utf_char = 0U;
	utf_cnt = 0U;
193
	for (i = 0U; i < utf->u_len; i++) {
194
		c = (uint8_t)utf->u_name[i];
L
Linus Torvalds 已提交
195 196

		/* Complete a multi-byte UTF-8 character */
197
		if (utf_cnt) {
L
Linus Torvalds 已提交
198 199 200
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
201
		} else {
L
Linus Torvalds 已提交
202
			/* Check for a multi-byte UTF-8 character */
203
			if (c & 0x80U) {
L
Linus Torvalds 已提交
204
				/* Start a multi-byte UTF-8 character */
205
				if ((c & 0xe0U) == 0xc0U) {
L
Linus Torvalds 已提交
206 207
					utf_char = c & 0x1fU;
					utf_cnt = 1;
208
				} else if ((c & 0xf0U) == 0xe0U) {
L
Linus Torvalds 已提交
209 210
					utf_char = c & 0x0fU;
					utf_cnt = 2;
211
				} else if ((c & 0xf8U) == 0xf0U) {
L
Linus Torvalds 已提交
212 213
					utf_char = c & 0x07U;
					utf_cnt = 3;
214
				} else if ((c & 0xfcU) == 0xf8U) {
L
Linus Torvalds 已提交
215 216
					utf_char = c & 0x03U;
					utf_cnt = 4;
217
				} else if ((c & 0xfeU) == 0xfcU) {
L
Linus Torvalds 已提交
218 219
					utf_char = c & 0x01U;
					utf_cnt = 5;
220
				} else {
L
Linus Torvalds 已提交
221
					goto error_out;
222
				}
L
Linus Torvalds 已提交
223
				continue;
224
			} else {
L
Linus Torvalds 已提交
225 226
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
227
			}
L
Linus Torvalds 已提交
228 229 230
		}

		/* Choose no compression if necessary */
231
		if (utf_char > max_val) {
232
			if (max_val == 0xffU) {
L
Linus Torvalds 已提交
233
				max_val = 0xffffU;
234
				ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
235 236 237 238 239
				goto try_again;
			}
			goto error_out;
		}

M
Marcin Slusarz 已提交
240
		if (max_val == 0xffffU)
241 242
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
L
Linus Torvalds 已提交
243 244
	}

245
	if (utf_cnt) {
246
error_out:
L
Linus Torvalds 已提交
247 248 249 250
		ocu[++u_len] = '?';
		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
	}

251 252
	ocu[length - 1] = (uint8_t)u_len + 1;

L
Linus Torvalds 已提交
253 254 255
	return u_len + 1;
}

256 257
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
			struct ustr *ocu_i)
L
Linus Torvalds 已提交
258 259 260 261 262 263 264 265 266 267 268 269
{
	uint8_t *ocu;
	uint32_t c;
	uint8_t cmp_id, ocu_len;
	int i;

	ocu = ocu_i->u_name;

	ocu_len = ocu_i->u_len;
	cmp_id = ocu_i->u_cmpID;
	utf_o->u_len = 0;

270
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
271 272 273 274 275 276
		memset(utf_o, 0, sizeof(struct ustr));
		utf_o->u_cmpID = 0;
		utf_o->u_len = 0;
		return 0;
	}

277 278 279
	if ((cmp_id != 8) && (cmp_id != 16)) {
		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
280 281 282
		return 0;
	}

283
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
284 285 286 287 288
		/* Expand OSTA compressed Unicode to Unicode */
		c = ocu[i++];
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

289 290
		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
					      UDF_NAME_LEN - utf_o->u_len);
L
Linus Torvalds 已提交
291
	}
292
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
293 294 295 296

	return utf_o->u_len;
}

297
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
298
			int length)
L
Linus Torvalds 已提交
299 300 301 302 303 304 305 306 307
{
	unsigned len, i, max_val;
	uint16_t uni_char;
	int u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

308
try_again:
L
Linus Torvalds 已提交
309
	u_len = 0U;
310 311
	for (i = 0U; i < uni->u_len; i++) {
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
L
Linus Torvalds 已提交
312 313 314
		if (len <= 0)
			continue;

315
		if (uni_char > max_val) {
L
Linus Torvalds 已提交
316
			max_val = 0xffffU;
317
			ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
318 319
			goto try_again;
		}
320

L
Linus Torvalds 已提交
321
		if (max_val == 0xffffU)
322 323
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
L
Linus Torvalds 已提交
324 325 326
		i += len - 1;
	}

327
	ocu[length - 1] = (uint8_t)u_len + 1;
L
Linus Torvalds 已提交
328 329 330
	return u_len + 1;
}

331
int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
332
		     int flen)
L
Linus Torvalds 已提交
333 334 335 336
{
	struct ustr filename, unifilename;
	int len;

M
Marcin Slusarz 已提交
337
	if (udf_build_ustr_exact(&unifilename, sname, flen))
L
Linus Torvalds 已提交
338 339
		return 0;

340 341
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
		if (!udf_CS0toUTF8(&filename, &unifilename)) {
M
Marcin Slusarz 已提交
342 343
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
L
Linus Torvalds 已提交
344 345
			return 0;
		}
346
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
347 348 349 350
		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
				  &unifilename)) {
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
L
Linus Torvalds 已提交
351 352
			return 0;
		}
M
Marcin Slusarz 已提交
353
	} else
L
Linus Torvalds 已提交
354 355
		return 0;

356 357
	len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
				     unifilename.u_name, unifilename.u_len);
M
Marcin Slusarz 已提交
358
	if (len)
L
Linus Torvalds 已提交
359
		return len;
360

L
Linus Torvalds 已提交
361 362 363
	return 0;
}

364 365
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
		     uint8_t *dname, int flen)
L
Linus Torvalds 已提交
366 367 368 369
{
	struct ustr unifilename;
	int namelen;

M
Marcin Slusarz 已提交
370
	if (!udf_char_to_ustr(&unifilename, sname, flen))
L
Linus Torvalds 已提交
371 372
		return 0;

373
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
374
		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
M
Marcin Slusarz 已提交
375
		if (!namelen)
L
Linus Torvalds 已提交
376
			return 0;
377
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
378 379 380
		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
					&unifilename, UDF_NAME_LEN);
		if (!namelen)
L
Linus Torvalds 已提交
381
			return 0;
M
Marcin Slusarz 已提交
382
	} else
L
Linus Torvalds 已提交
383 384 385 386 387 388
		return 0;

	return namelen;
}

/* Marker characters and limits used when mangling a UDF name. */
#define ILLEGAL_CHAR_MARK	'_'	/* replaces a run of '/' or NUL bytes */
#define EXT_MARK		'.'	/* separates base name and extension */
#define CRC_MARK		'#'	/* introduces the four hex CRC digits */
#define EXT_SIZE 		5	/* longest extension that is kept */

/* Non-zero for the bytes that are replaced in a Linux name: '/' and NUL. */
static int udf_is_illegal_name_char(uint8_t c)
{
	return c == '/' || c == 0;
}

/*
 * udf_translate_to_linux
 *
 * Copy udfName (udfLen bytes) to newName, replacing every run of '/' or
 * NUL bytes with a single ILLEGAL_CHAR_MARK.  At most 256 bytes are copied.
 *
 * If anything had to be replaced or dropped, or the name is "." or "..",
 * the result is made unique: the base name is cut, then CRC_MARK and the
 * 16-bit value of udf_crc(fidName, fidNameLen, 0) as four upper-case hex
 * digits are appended, followed by the extension (up to EXT_SIZE bytes
 * after the last qualifying EXT_MARK) if there is one.
 *
 * Returns the number of bytes written to newName.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
				  int udfLen, uint8_t *fidName,
				  int fidNameLen)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	uint8_t ext[EXT_SIZE];
	const uint8_t *tail;
	unsigned short crc;
	uint8_t ch;
	int src, k, tail_len, max_base;
	int out = 0, need_crc = 0;
	int has_ext = 0, ext_src = 0, ext_out = 0, ext_len = 0;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		/* "." and ".." are copied verbatim and always get a CRC. */
		memcpy(newName, udfName, udfLen);
		out = udfLen;
		need_crc = 1;
	} else {
		src = 0;
		while (src < udfLen) {
			ch = udfName[src];

			if (udf_is_illegal_name_char(ch)) {
				/* Swallow the rest of the run. */
				while (src + 1 < udfLen &&
				       udf_is_illegal_name_char(udfName[src + 1]))
					src++;
				ch = ILLEGAL_CHAR_MARK;
				need_crc = 1;
			}

			/*
			 * A dot close enough to the end starts an extension,
			 * unless it is the very last byte of the name.
			 */
			if (ch == EXT_MARK && udfLen - src - 1 <= EXT_SIZE) {
				has_ext = (src + 1 != udfLen);
				if (has_ext) {
					ext_src = src;
					ext_out = out;
				}
			}

			if (out < 256)
				newName[out++] = ch;
			else
				need_crc = 1;

			src++;
		}
	}

	if (!need_crc)
		return out;

	if (has_ext) {
		/* Re-read the extension from the source, cleaning it up. */
		tail = udfName + ext_src + 1;
		tail_len = udfLen - ext_src - 1;

		for (k = 0; k < EXT_SIZE && k < tail_len; k++) {
			ch = tail[k];
			if (udf_is_illegal_name_char(ch)) {
				ch = ILLEGAL_CHAR_MARK;
				while (k + 1 < tail_len &&
				       (k + 1 < EXT_SIZE &&
					udf_is_illegal_name_char(tail[k + 1])))
					k++;
			}
			ext[ext_len++] = ch;
		}

		/* Cut the base name where the CRC will be inserted. */
		max_base = 250 - ext_len;
		out = (out > max_base) ? max_base : ext_out;
	} else if (out > 250) {
		out = 250;
	}

	newName[out++] = CRC_MARK;
	crc = udf_crc(fidName, fidNameLen, 0);
	newName[out++] = hex_digits[(crc >> 12) & 0xf];
	newName[out++] = hex_digits[(crc >> 8) & 0xf];
	newName[out++] = hex_digits[(crc >> 4) & 0xf];
	newName[out++] = hex_digits[crc & 0xf];

	if (has_ext) {
		newName[out++] = EXT_MARK;
		for (k = 0; k < ext_len; k++)
			newName[out++] = ext[k];
	}

	return out;
}