/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>
#include <linux/crc-itu-t.h>
#include <linux/slab.h>

#include "udf_sb.h"

static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);

33
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
L
Linus Torvalds 已提交
34
{
35
	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
L
Linus Torvalds 已提交
36
		return 0;
37

L
Linus Torvalds 已提交
38 39 40 41
	memset(dest, 0, sizeof(struct ustr));
	memcpy(dest->u_name, src, strlen);
	dest->u_cmpID = 0x08;
	dest->u_len = strlen;
42

L
Linus Torvalds 已提交
43 44 45 46 47 48
	return strlen;
}

/*
 * udf_build_ustr
 */
49
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
L
Linus Torvalds 已提交
50 51 52
{
	int usesize;

M
Marcin Slusarz 已提交
53
	if (!dest || !ptr || !size)
L
Linus Torvalds 已提交
54
		return -1;
M
Marcin Slusarz 已提交
55
	BUG_ON(size < 2);
L
Linus Torvalds 已提交
56

M
Marcin Slusarz 已提交
57 58
	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
	usesize = min(usesize, size - 2);
59
	dest->u_cmpID = ptr[0];
M
Marcin Slusarz 已提交
60 61 62
	dest->u_len = usesize;
	memcpy(dest->u_name, ptr + 1, usesize);
	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
63

L
Linus Torvalds 已提交
64 65 66 67 68 69
	return 0;
}

/*
 * udf_build_ustr_exact
 */
70
static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
L
Linus Torvalds 已提交
71
{
72
	if ((!dest) || (!ptr) || (!exactsize))
L
Linus Torvalds 已提交
73 74 75
		return -1;

	memset(dest, 0, sizeof(struct ustr));
76 77 78
	dest->u_cmpID = ptr[0];
	dest->u_len = exactsize - 1;
	memcpy(dest->u_name, ptr + 1, exactsize - 1);
79

L
Linus Torvalds 已提交
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
	return 0;
}

/*
 * udf_ocu_to_utf8
 *
 * PURPOSE
 *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
 *
 * PRE-CONDITIONS
 *	utf			Pointer to UTF-8 output buffer.
 *	ocu			Pointer to OSTA Compressed Unicode input buffer
 *				of size UDF_NAME_LEN bytes.
 * 				both of type "struct ustr *"
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
102
int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
L
Linus Torvalds 已提交
103
{
104
	const uint8_t *ocu;
L
Linus Torvalds 已提交
105 106 107 108
	uint8_t cmp_id, ocu_len;
	int i;

	ocu_len = ocu_i->u_len;
109
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
110 111 112 113
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

114 115 116
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
117
		pr_err("unknown compression code (%d) stri=%s\n",
118
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
119 120 121
		return 0;
	}

122 123
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
124
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
125 126

		/* Expand OSTA compressed Unicode to Unicode */
127
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
128 129 130 131
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		/* Compress Unicode to UTF-8 */
132
		if (c < 0x80U)
133
			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
134
		else if (c < 0x800U) {
M
Marcin Slusarz 已提交
135 136 137 138
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xc0 | (c >> 6));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
139
		} else {
M
Marcin Slusarz 已提交
140 141 142 143 144 145 146
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xe0 | (c >> 12));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 |
							  ((c >> 6) & 0x3f));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
L
Linus Torvalds 已提交
147 148
		}
	}
149
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176

	return utf_o->u_len;
}

/*
 *
 * udf_utf8_to_ocu
 *
 * PURPOSE
 *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_lookup().
 *
 * PRE-CONDITIONS
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of size UDF_NAME_LEN bytes.
 *	utf			Pointer to UTF-8 input buffer.
 *	utf_len			Length of UTF-8 input buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
177
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
L
Linus Torvalds 已提交
178 179 180 181 182 183 184 185
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

186
try_again:
L
Linus Torvalds 已提交
187 188 189
	u_len = 0U;
	utf_char = 0U;
	utf_cnt = 0U;
190
	for (i = 0U; i < utf->u_len; i++) {
191
		c = (uint8_t)utf->u_name[i];
L
Linus Torvalds 已提交
192 193

		/* Complete a multi-byte UTF-8 character */
194
		if (utf_cnt) {
L
Linus Torvalds 已提交
195 196 197
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
198
		} else {
L
Linus Torvalds 已提交
199
			/* Check for a multi-byte UTF-8 character */
200
			if (c & 0x80U) {
L
Linus Torvalds 已提交
201
				/* Start a multi-byte UTF-8 character */
202
				if ((c & 0xe0U) == 0xc0U) {
L
Linus Torvalds 已提交
203 204
					utf_char = c & 0x1fU;
					utf_cnt = 1;
205
				} else if ((c & 0xf0U) == 0xe0U) {
L
Linus Torvalds 已提交
206 207
					utf_char = c & 0x0fU;
					utf_cnt = 2;
208
				} else if ((c & 0xf8U) == 0xf0U) {
L
Linus Torvalds 已提交
209 210
					utf_char = c & 0x07U;
					utf_cnt = 3;
211
				} else if ((c & 0xfcU) == 0xf8U) {
L
Linus Torvalds 已提交
212 213
					utf_char = c & 0x03U;
					utf_cnt = 4;
214
				} else if ((c & 0xfeU) == 0xfcU) {
L
Linus Torvalds 已提交
215 216
					utf_char = c & 0x01U;
					utf_cnt = 5;
217
				} else {
L
Linus Torvalds 已提交
218
					goto error_out;
219
				}
L
Linus Torvalds 已提交
220
				continue;
221
			} else {
L
Linus Torvalds 已提交
222 223
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
224
			}
L
Linus Torvalds 已提交
225 226 227
		}

		/* Choose no compression if necessary */
228
		if (utf_char > max_val) {
229
			if (max_val == 0xffU) {
L
Linus Torvalds 已提交
230
				max_val = 0xffffU;
231
				ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
232 233 234 235 236
				goto try_again;
			}
			goto error_out;
		}

M
Marcin Slusarz 已提交
237
		if (max_val == 0xffffU)
238 239
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
L
Linus Torvalds 已提交
240 241
	}

242
	if (utf_cnt) {
243
error_out:
L
Linus Torvalds 已提交
244
		ocu[++u_len] = '?';
J
Joe Perches 已提交
245
		printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
L
Linus Torvalds 已提交
246 247
	}

248 249
	ocu[length - 1] = (uint8_t)u_len + 1;

L
Linus Torvalds 已提交
250 251 252
	return u_len + 1;
}

253
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
254
			const struct ustr *ocu_i)
L
Linus Torvalds 已提交
255
{
256
	const uint8_t *ocu;
L
Linus Torvalds 已提交
257
	uint8_t cmp_id, ocu_len;
258
	int i, len;
L
Linus Torvalds 已提交
259 260 261


	ocu_len = ocu_i->u_len;
262
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
263 264 265 266
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

267 268 269
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
270
		pr_err("unknown compression code (%d) stri=%s\n",
271
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
272 273 274
		return 0;
	}

275 276
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
277
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
278
		/* Expand OSTA compressed Unicode to Unicode */
279
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
280 281 282
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

283 284 285 286 287 288 289
		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
				    UDF_NAME_LEN - utf_o->u_len);
		/* Valid character? */
		if (len >= 0)
			utf_o->u_len += len;
		else
			utf_o->u_name[utf_o->u_len++] = '?';
L
Linus Torvalds 已提交
290
	}
291
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
292 293 294 295

	return utf_o->u_len;
}

296
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
297
			int length)
L
Linus Torvalds 已提交
298
{
299 300
	int len;
	unsigned i, max_val;
L
Linus Torvalds 已提交
301 302 303 304 305 306 307
	uint16_t uni_char;
	int u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

308
try_again:
L
Linus Torvalds 已提交
309
	u_len = 0U;
310 311
	for (i = 0U; i < uni->u_len; i++) {
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
312
		if (!len)
L
Linus Torvalds 已提交
313
			continue;
314 315 316 317 318
		/* Invalid character, deal with it */
		if (len < 0) {
			len = 1;
			uni_char = '?';
		}
L
Linus Torvalds 已提交
319

320
		if (uni_char > max_val) {
L
Linus Torvalds 已提交
321
			max_val = 0xffffU;
322
			ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
323 324
			goto try_again;
		}
325

L
Linus Torvalds 已提交
326
		if (max_val == 0xffffU)
327 328
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
L
Linus Torvalds 已提交
329 330 331
		i += len - 1;
	}

332
	ocu[length - 1] = (uint8_t)u_len + 1;
L
Linus Torvalds 已提交
333 334 335
	return u_len + 1;
}

336
int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
337
		     int flen)
L
Linus Torvalds 已提交
338
{
339 340
	struct ustr *filename, *unifilename;
	int len = 0;
L
Linus Torvalds 已提交
341

342 343
	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!filename)
L
Linus Torvalds 已提交
344 345
		return 0;

346 347 348 349 350 351 352
	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!unifilename)
		goto out1;

	if (udf_build_ustr_exact(unifilename, sname, flen))
		goto out2;

353
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
354
		if (!udf_CS0toUTF8(filename, unifilename)) {
M
Marcin Slusarz 已提交
355 356
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
357
			goto out2;
L
Linus Torvalds 已提交
358
		}
359
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
360 361
		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
				  unifilename)) {
M
Marcin Slusarz 已提交
362 363
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
364
			goto out2;
L
Linus Torvalds 已提交
365
		}
M
Marcin Slusarz 已提交
366
	} else
367 368 369 370 371 372 373 374 375
		goto out2;

	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
				     unifilename->u_name, unifilename->u_len);
out2:
	kfree(unifilename);
out1:
	kfree(filename);
	return len;
L
Linus Torvalds 已提交
376 377
}

378 379
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
		     uint8_t *dname, int flen)
L
Linus Torvalds 已提交
380 381 382 383
{
	struct ustr unifilename;
	int namelen;

M
Marcin Slusarz 已提交
384
	if (!udf_char_to_ustr(&unifilename, sname, flen))
L
Linus Torvalds 已提交
385 386
		return 0;

387
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
388
		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
M
Marcin Slusarz 已提交
389
		if (!namelen)
L
Linus Torvalds 已提交
390
			return 0;
391
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
392 393 394
		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
					&unifilename, UDF_NAME_LEN);
		if (!namelen)
L
Linus Torvalds 已提交
395
			return 0;
M
Marcin Slusarz 已提交
396
	} else
L
Linus Torvalds 已提交
397 398 399 400 401 402
		return 0;

	return namelen;
}

/* Constants used when mangling names in udf_translate_to_linux(). */
#define ILLEGAL_CHAR_MARK	'_'	/* stands in for a run of '/' or NUL */
#define EXT_MARK		'.'	/* separates the name from its extension */
#define CRC_MARK		'#'	/* introduces the four hex CRC digits */
#define EXT_SIZE 		5	/* longest extension that is preserved */
L
Linus Torvalds 已提交
406

M
Marcin Slusarz 已提交
407 408 409
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
				  int udfLen, uint8_t *fidName,
				  int fidNameLen)
L
Linus Torvalds 已提交
410
{
411
	int index, newIndex = 0, needsCRC = 0;
L
Linus Torvalds 已提交
412 413 414 415 416
	int extIndex = 0, newExtIndex = 0, hasExt = 0;
	unsigned short valueCRC;
	uint8_t curr;
	const uint8_t hexChar[] = "0123456789ABCDEF";

417 418
	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
L
Linus Torvalds 已提交
419 420 421
		needsCRC = 1;
		newIndex = udfLen;
		memcpy(newName, udfName, udfLen);
422 423
	} else {
		for (index = 0; index < udfLen; index++) {
L
Linus Torvalds 已提交
424
			curr = udfName[index];
425
			if (curr == '/' || curr == 0) {
L
Linus Torvalds 已提交
426 427
				needsCRC = 1;
				curr = ILLEGAL_CHAR_MARK;
M
Marcin Slusarz 已提交
428 429 430
				while (index + 1 < udfLen &&
						(udfName[index + 1] == '/' ||
						 udfName[index + 1] == 0))
L
Linus Torvalds 已提交
431
					index++;
M
Marcin Slusarz 已提交
432 433 434 435
			}
			if (curr == EXT_MARK &&
					(udfLen - index - 1) <= EXT_SIZE) {
				if (udfLen == index + 1)
L
Linus Torvalds 已提交
436
					hasExt = 0;
M
Marcin Slusarz 已提交
437
				else {
L
Linus Torvalds 已提交
438 439 440 441 442 443 444 445 446 447 448
					hasExt = 1;
					extIndex = index;
					newExtIndex = newIndex;
				}
			}
			if (newIndex < 256)
				newName[newIndex++] = curr;
			else
				needsCRC = 1;
		}
	}
449
	if (needsCRC) {
L
Linus Torvalds 已提交
450 451 452
		uint8_t ext[EXT_SIZE];
		int localExtIndex = 0;

453
		if (hasExt) {
L
Linus Torvalds 已提交
454
			int maxFilenameLen;
M
Marcin Slusarz 已提交
455 456 457
			for (index = 0;
			     index < EXT_SIZE && extIndex + index + 1 < udfLen;
			     index++) {
L
Linus Torvalds 已提交
458 459
				curr = udfName[extIndex + index + 1];

460
				if (curr == '/' || curr == 0) {
L
Linus Torvalds 已提交
461 462
					needsCRC = 1;
					curr = ILLEGAL_CHAR_MARK;
M
Marcin Slusarz 已提交
463 464 465 466
					while (extIndex + index + 2 < udfLen &&
					      (index + 1 < EXT_SIZE &&
						(udfName[extIndex + index + 2] == '/' ||
						 udfName[extIndex + index + 2] == 0)))
L
Linus Torvalds 已提交
467 468 469 470 471 472 473 474 475
						index++;
				}
				ext[localExtIndex++] = curr;
			}
			maxFilenameLen = 250 - localExtIndex;
			if (newIndex > maxFilenameLen)
				newIndex = maxFilenameLen;
			else
				newIndex = newExtIndex;
M
Marcin Slusarz 已提交
476
		} else if (newIndex > 250)
L
Linus Torvalds 已提交
477 478
			newIndex = 250;
		newName[newIndex++] = CRC_MARK;
479
		valueCRC = crc_itu_t(0, fidName, fidNameLen);
L
Linus Torvalds 已提交
480 481 482 483 484
		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];

485
		if (hasExt) {
L
Linus Torvalds 已提交
486
			newName[newIndex++] = EXT_MARK;
487
			for (index = 0; index < localExtIndex; index++)
L
Linus Torvalds 已提交
488 489 490
				newName[newIndex++] = ext[index];
		}
	}
491

L
Linus Torvalds 已提交
492 493
	return newIndex;
}