unicode.c 11.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629.txt
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>
26
#include <linux/crc-itu-t.h>
27
#include <linux/slab.h>
L
Linus Torvalds 已提交
28 29 30

#include "udf_sb.h"

31 32
/* Forward declaration; the definition appears later in this file. */
static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
				  int);
L
Linus Torvalds 已提交
33

34
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
L
Linus Torvalds 已提交
35
{
36
	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
L
Linus Torvalds 已提交
37
		return 0;
38

L
Linus Torvalds 已提交
39 40 41 42
	memset(dest, 0, sizeof(struct ustr));
	memcpy(dest->u_name, src, strlen);
	dest->u_cmpID = 0x08;
	dest->u_len = strlen;
43

L
Linus Torvalds 已提交
44 45 46 47 48 49
	return strlen;
}

/*
 * udf_build_ustr
 */
50
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
L
Linus Torvalds 已提交
51 52 53
{
	int usesize;

M
Marcin Slusarz 已提交
54
	if (!dest || !ptr || !size)
L
Linus Torvalds 已提交
55
		return -1;
M
Marcin Slusarz 已提交
56
	BUG_ON(size < 2);
L
Linus Torvalds 已提交
57

M
Marcin Slusarz 已提交
58 59
	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
	usesize = min(usesize, size - 2);
60
	dest->u_cmpID = ptr[0];
M
Marcin Slusarz 已提交
61 62 63
	dest->u_len = usesize;
	memcpy(dest->u_name, ptr + 1, usesize);
	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
64

L
Linus Torvalds 已提交
65 66 67 68 69 70
	return 0;
}

/*
 * udf_build_ustr_exact
 */
71
static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
L
Linus Torvalds 已提交
72 73
{
	memset(dest, 0, sizeof(struct ustr));
74 75 76
	dest->u_cmpID = ptr[0];
	dest->u_len = exactsize - 1;
	memcpy(dest->u_name, ptr + 1, exactsize - 1);
L
Linus Torvalds 已提交
77 78 79
}

/*
80
 * udf_CS0toUTF8
L
Linus Torvalds 已提交
81 82 83 84 85 86 87 88 89 90 91
 *
 * PURPOSE
 *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
 *
 * PRE-CONDITIONS
 *	utf			Pointer to UTF-8 output buffer.
 *	ocu			Pointer to OSTA Compressed Unicode input buffer
 *				of size UDF_NAME_LEN bytes.
 * 				both of type "struct ustr *"
 *
 * POST-CONDITIONS
92
 *	<return>		>= 0 on success.
L
Linus Torvalds 已提交
93 94 95 96 97
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
98
int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
L
Linus Torvalds 已提交
99
{
100
	const uint8_t *ocu;
L
Linus Torvalds 已提交
101 102 103 104
	uint8_t cmp_id, ocu_len;
	int i;

	ocu_len = ocu_i->u_len;
105
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
106 107 108 109
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

110 111 112
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
113
		pr_err("unknown compression code (%d) stri=%s\n",
114
		       cmp_id, ocu_i->u_name);
115
		return -EINVAL;
L
Linus Torvalds 已提交
116 117
	}

118 119
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
120
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
121 122

		/* Expand OSTA compressed Unicode to Unicode */
123
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
124 125 126 127
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		/* Compress Unicode to UTF-8 */
128
		if (c < 0x80U)
129
			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
130
		else if (c < 0x800U) {
M
Marcin Slusarz 已提交
131 132 133 134
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xc0 | (c >> 6));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
135
		} else {
M
Marcin Slusarz 已提交
136 137 138 139 140 141 142
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xe0 | (c >> 12));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 |
							  ((c >> 6) & 0x3f));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
L
Linus Torvalds 已提交
143 144
		}
	}
145
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
146 147 148 149 150 151

	return utf_o->u_len;
}

/*
 *
152
 * udf_UTF8toCS0
L
Linus Torvalds 已提交
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
 *
 * PURPOSE
 *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_lookup().
 *
 * PRE-CONDITIONS
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of size UDF_NAME_LEN bytes.
 *	utf			Pointer to UTF-8 input buffer.
 *	utf_len			Length of UTF-8 input buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
173
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
L
Linus Torvalds 已提交
174 175 176 177 178 179 180 181
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

182
try_again:
L
Linus Torvalds 已提交
183 184 185
	u_len = 0U;
	utf_char = 0U;
	utf_cnt = 0U;
186
	for (i = 0U; i < utf->u_len; i++) {
187
		c = (uint8_t)utf->u_name[i];
L
Linus Torvalds 已提交
188 189

		/* Complete a multi-byte UTF-8 character */
190
		if (utf_cnt) {
L
Linus Torvalds 已提交
191 192 193
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
194
		} else {
L
Linus Torvalds 已提交
195
			/* Check for a multi-byte UTF-8 character */
196
			if (c & 0x80U) {
L
Linus Torvalds 已提交
197
				/* Start a multi-byte UTF-8 character */
198
				if ((c & 0xe0U) == 0xc0U) {
L
Linus Torvalds 已提交
199 200
					utf_char = c & 0x1fU;
					utf_cnt = 1;
201
				} else if ((c & 0xf0U) == 0xe0U) {
L
Linus Torvalds 已提交
202 203
					utf_char = c & 0x0fU;
					utf_cnt = 2;
204
				} else if ((c & 0xf8U) == 0xf0U) {
L
Linus Torvalds 已提交
205 206
					utf_char = c & 0x07U;
					utf_cnt = 3;
207
				} else if ((c & 0xfcU) == 0xf8U) {
L
Linus Torvalds 已提交
208 209
					utf_char = c & 0x03U;
					utf_cnt = 4;
210
				} else if ((c & 0xfeU) == 0xfcU) {
L
Linus Torvalds 已提交
211 212
					utf_char = c & 0x01U;
					utf_cnt = 5;
213
				} else {
L
Linus Torvalds 已提交
214
					goto error_out;
215
				}
L
Linus Torvalds 已提交
216
				continue;
217
			} else {
L
Linus Torvalds 已提交
218 219
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
220
			}
L
Linus Torvalds 已提交
221 222 223
		}

		/* Choose no compression if necessary */
224
		if (utf_char > max_val) {
225
			if (max_val == 0xffU) {
L
Linus Torvalds 已提交
226
				max_val = 0xffffU;
227
				ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
228 229 230 231 232
				goto try_again;
			}
			goto error_out;
		}

M
Marcin Slusarz 已提交
233
		if (max_val == 0xffffU)
234 235
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
L
Linus Torvalds 已提交
236 237
	}

238
	if (utf_cnt) {
239
error_out:
L
Linus Torvalds 已提交
240
		ocu[++u_len] = '?';
J
Joe Perches 已提交
241
		printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
L
Linus Torvalds 已提交
242 243
	}

244 245
	ocu[length - 1] = (uint8_t)u_len + 1;

L
Linus Torvalds 已提交
246 247 248
	return u_len + 1;
}

249
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
250
			const struct ustr *ocu_i)
L
Linus Torvalds 已提交
251
{
252
	const uint8_t *ocu;
L
Linus Torvalds 已提交
253
	uint8_t cmp_id, ocu_len;
254
	int i, len;
L
Linus Torvalds 已提交
255 256 257


	ocu_len = ocu_i->u_len;
258
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
259 260 261 262
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

263 264 265
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
266
		pr_err("unknown compression code (%d) stri=%s\n",
267
		       cmp_id, ocu_i->u_name);
268
		return -EINVAL;
L
Linus Torvalds 已提交
269 270
	}

271 272
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
273
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
274
		/* Expand OSTA compressed Unicode to Unicode */
275
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
276 277 278
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

279 280 281 282 283 284 285
		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
				    UDF_NAME_LEN - utf_o->u_len);
		/* Valid character? */
		if (len >= 0)
			utf_o->u_len += len;
		else
			utf_o->u_name[utf_o->u_len++] = '?';
L
Linus Torvalds 已提交
286
	}
287
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
288 289 290 291

	return utf_o->u_len;
}

292
/*
 * udf_NLStoCS0
 *
 * PURPOSE
 *	Convert a name in an NLS character set to OSTA Compressed Unicode.
 *
 * DESCRIPTION
 *	Encoding starts with compression ID 8 (one byte per character) and
 *	restarts from the beginning with compression ID 16 (two bytes per
 *	character, most significant byte first) as soon as a character above
 *	0xff is met.  Input that char2uni() rejects is encoded as '?', one
 *	input byte at a time.
 *
 * PRE-CONDITIONS
 *	nls			NLS table whose char2uni() does the decoding.
 *	ocu			Output buffer of 'length' bytes.
 *	uni			Input name (struct ustr).
 *	length			Size of the output buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Number of bytes used at the start of ocu,
 *				compression ID included; the same value is
 *				stored in ocu[length - 1].  Zero if the
 *				converted name does not fit in the buffer.
 */
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
			int length)
{
	int len;
	unsigned i, max_val;
	uint16_t uni_char;
	int u_len, u_ch;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;
	u_ch = 1;	/* output bytes per character */

try_again:
	u_len = 0;
	for (i = 0U; i < uni->u_len; i++) {
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
		if (!len)
			continue;
		/* Invalid character, deal with it */
		if (len < 0) {
			len = 1;
			uni_char = '?';
		}

		if (uni_char > max_val) {
			max_val = 0xffffU;
			ocu[0] = (uint8_t)0x10U;
			u_ch = 2;
			goto try_again;
		}

		/*
		 * Name didn't fit?  Payload may only occupy bytes
		 * 1 .. length - 2; the last byte is reserved for the length.
		 */
		if (u_len + u_ch > length - 2)
			return 0;

		if (max_val == 0xffffU)
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
		/* Skip the rest of the input bytes this character consumed */
		i += len - 1;
	}

	ocu[length - 1] = (uint8_t)u_len + 1;
	return u_len + 1;
}

332 333
int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
L
Linus Torvalds 已提交
334
{
335
	struct ustr *filename, *unifilename;
336
	int ret = 0;
L
Linus Torvalds 已提交
337

338 339 340
	if (!slen)
		return -EIO;

341 342
	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!filename)
343
		return -ENOMEM;
L
Linus Torvalds 已提交
344

345
	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 347
	if (!unifilename) {
		ret = -ENOMEM;
348
		goto out1;
349
	}
350

351
	udf_build_ustr_exact(unifilename, sname, slen);
352
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
353 354
		ret = udf_CS0toUTF8(filename, unifilename);
		if (ret < 0) {
M
Marcin Slusarz 已提交
355 356
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
357
			goto out2;
L
Linus Torvalds 已提交
358
		}
359
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
360 361 362
		ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
				   unifilename);
		if (ret < 0) {
M
Marcin Slusarz 已提交
363 364
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
365
			goto out2;
L
Linus Torvalds 已提交
366
		}
M
Marcin Slusarz 已提交
367
	} else
368
		BUG();
369

370
	ret = udf_translate_to_linux(dname, dlen,
371
				     filename->u_name, filename->u_len,
372 373 374 375 376
				     unifilename->u_name, unifilename->u_len);
out2:
	kfree(unifilename);
out1:
	kfree(filename);
377
	return ret;
L
Linus Torvalds 已提交
378 379
}

380 381
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
		     uint8_t *dname, int flen)
L
Linus Torvalds 已提交
382 383 384 385
{
	struct ustr unifilename;
	int namelen;

M
Marcin Slusarz 已提交
386
	if (!udf_char_to_ustr(&unifilename, sname, flen))
L
Linus Torvalds 已提交
387 388
		return 0;

389
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
390
		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
M
Marcin Slusarz 已提交
391
		if (!namelen)
L
Linus Torvalds 已提交
392
			return 0;
393
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
394 395 396
		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
					&unifilename, UDF_NAME_LEN);
		if (!namelen)
L
Linus Torvalds 已提交
397
			return 0;
M
Marcin Slusarz 已提交
398
	} else
L
Linus Torvalds 已提交
399 400 401 402 403 404
		return 0;

	return namelen;
}

#define ILLEGAL_CHAR_MARK	'_'
#define EXT_MARK		'.'
#define CRC_MARK		'#'
#define EXT_SIZE 		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * udf_translate_to_linux
 *
 * PURPOSE
 *	Turn a converted UDF name into one that is usable as a Linux name.
 *
 * DESCRIPTION
 *	Runs of '/' and NUL are replaced by a single ILLEGAL_CHAR_MARK.  When
 *	the name had to be altered or truncated, or is "." or "..", it is
 *	rewritten as <base> CRC_MARK <4 hex digits> [EXT_MARK <extension>],
 *	where the digits are the CRC of fidName and the extension is at most
 *	EXT_SIZE characters taken from after the last suitable EXT_MARK.
 *
 * PRE-CONDITIONS
 *	newName, newLen		Output buffer and its size in bytes.
 *	udfName, udfLen		Converted name and its length in bytes.
 *	fidName, fidNameLen	Bytes the CRC is computed over.
 *
 * POST-CONDITIONS
 *	<return>		Number of bytes written to newName, or
 *				-ENAMETOOLONG when newName cannot hold the
 *				CRC suffix (or the "." / ".." name itself).
 */
static int udf_translate_to_linux(uint8_t *newName, int newLen,
				  uint8_t *udfName, int udfLen,
				  uint8_t *fidName, int fidNameLen)
{
	int index, newIndex = 0, needsCRC = 0;
	int extIndex = 0, newExtIndex = 0, hasExt = 0;
	unsigned short valueCRC;
	uint8_t curr;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		/* "." and ".." are always mangled */
		if (udfLen > newLen)
			return -ENAMETOOLONG;
		needsCRC = 1;
		newIndex = udfLen;
		memcpy(newName, udfName, udfLen);
	} else {
		for (index = 0; index < udfLen; index++) {
			curr = udfName[index];
			if (curr == '/' || curr == 0) {
				needsCRC = 1;
				curr = ILLEGAL_CHAR_MARK;
				/* Collapse a run of illegal characters */
				while (index + 1 < udfLen &&
						(udfName[index + 1] == '/' ||
						 udfName[index + 1] == 0))
					index++;
			}
			if (curr == EXT_MARK &&
					(udfLen - index - 1) <= EXT_SIZE) {
				if (udfLen == index + 1)
					hasExt = 0;
				else {
					hasExt = 1;
					extIndex = index;
					newExtIndex = newIndex;
				}
			}
			if (newIndex < newLen)
				newName[newIndex++] = curr;
			else
				needsCRC = 1;
		}
	}
	if (needsCRC) {
		uint8_t ext[EXT_SIZE];
		int localExtIndex = 0;
		int suffixLen = CRC_LEN;
		int maxFilenameLen;

		if (hasExt) {
			/* Sanitise the extension the same way as the base */
			for (index = 0;
			     index < EXT_SIZE && extIndex + index + 1 < udfLen;
			     index++) {
				curr = udfName[extIndex + index + 1];

				if (curr == '/' || curr == 0) {
					curr = ILLEGAL_CHAR_MARK;
					while (extIndex + index + 2 < udfLen &&
					      (index + 1 < EXT_SIZE &&
						(udfName[extIndex + index + 2] == '/' ||
						 udfName[extIndex + index + 2] == 0)))
						index++;
				}
				ext[localExtIndex++] = curr;
			}
			/* EXT_MARK and the extension follow the CRC */
			suffixLen += 1 + localExtIndex;
			/* The base name ends where the extension began */
			newIndex = newExtIndex;
		}

		/* Without room for the suffix nothing valid can be built */
		if (newLen < suffixLen)
			return -ENAMETOOLONG;
		maxFilenameLen = newLen - suffixLen;
		if (newIndex > maxFilenameLen)
			newIndex = maxFilenameLen;

		newName[newIndex++] = CRC_MARK;
		valueCRC = crc_itu_t(0, fidName, fidNameLen);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC);

		if (hasExt) {
			newName[newIndex++] = EXT_MARK;
			for (index = 0; index < localExtIndex; index++)
				newName[newIndex++] = ext[index];
		}
	}

	return newIndex;
}