unicode.c 11.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>
26
#include <linux/crc-itu-t.h>
27
#include <linux/slab.h>
L
Linus Torvalds 已提交
28 29 30

#include "udf_sb.h"

31 32
/*
 * Forward declaration: udf_get_filename() calls this helper before its
 * definition at the end of this file.
 */
static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
				  int);
L
Linus Torvalds 已提交
33

34
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
L
Linus Torvalds 已提交
35
{
36
	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
L
Linus Torvalds 已提交
37
		return 0;
38

L
Linus Torvalds 已提交
39 40 41 42
	memset(dest, 0, sizeof(struct ustr));
	memcpy(dest->u_name, src, strlen);
	dest->u_cmpID = 0x08;
	dest->u_len = strlen;
43

L
Linus Torvalds 已提交
44 45 46 47 48 49
	return strlen;
}

/*
 * udf_build_ustr
 */
50
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
L
Linus Torvalds 已提交
51 52 53
{
	int usesize;

M
Marcin Slusarz 已提交
54
	if (!dest || !ptr || !size)
L
Linus Torvalds 已提交
55
		return -1;
M
Marcin Slusarz 已提交
56
	BUG_ON(size < 2);
L
Linus Torvalds 已提交
57

M
Marcin Slusarz 已提交
58 59
	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
	usesize = min(usesize, size - 2);
60
	dest->u_cmpID = ptr[0];
M
Marcin Slusarz 已提交
61 62 63
	dest->u_len = usesize;
	memcpy(dest->u_name, ptr + 1, usesize);
	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
64

L
Linus Torvalds 已提交
65 66 67 68 69 70
	return 0;
}

/*
 * udf_build_ustr_exact
 */
71
static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
L
Linus Torvalds 已提交
72 73
{
	memset(dest, 0, sizeof(struct ustr));
74 75 76
	dest->u_cmpID = ptr[0];
	dest->u_len = exactsize - 1;
	memcpy(dest->u_name, ptr + 1, exactsize - 1);
L
Linus Torvalds 已提交
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
}

/*
 * udf_ocu_to_utf8
 *
 * PURPOSE
 *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
 *
 * PRE-CONDITIONS
 *	utf			Pointer to UTF-8 output buffer.
 *	ocu			Pointer to OSTA Compressed Unicode input buffer
 *				of size UDF_NAME_LEN bytes.
 * 				both of type "struct ustr *"
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
98
int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
L
Linus Torvalds 已提交
99
{
100
	const uint8_t *ocu;
L
Linus Torvalds 已提交
101 102 103 104
	uint8_t cmp_id, ocu_len;
	int i;

	ocu_len = ocu_i->u_len;
105
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
106 107 108 109
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

110 111 112
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
113
		pr_err("unknown compression code (%d) stri=%s\n",
114
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
115 116 117
		return 0;
	}

118 119
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
120
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
121 122

		/* Expand OSTA compressed Unicode to Unicode */
123
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
124 125 126 127
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		/* Compress Unicode to UTF-8 */
128
		if (c < 0x80U)
129
			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
130
		else if (c < 0x800U) {
M
Marcin Slusarz 已提交
131 132 133 134
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xc0 | (c >> 6));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
135
		} else {
M
Marcin Slusarz 已提交
136 137 138 139 140 141 142
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xe0 | (c >> 12));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 |
							  ((c >> 6) & 0x3f));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
L
Linus Torvalds 已提交
143 144
		}
	}
145
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172

	return utf_o->u_len;
}

/*
 *
 * udf_utf8_to_ocu
 *
 * PURPOSE
 *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_lookup().
 *
 * PRE-CONDITIONS
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of size UDF_NAME_LEN bytes.
 *	utf			Pointer to UTF-8 input buffer.
 *	utf_len			Length of UTF-8 input buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
173
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
L
Linus Torvalds 已提交
174 175 176 177 178 179 180 181
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

182
try_again:
L
Linus Torvalds 已提交
183 184 185
	u_len = 0U;
	utf_char = 0U;
	utf_cnt = 0U;
186
	for (i = 0U; i < utf->u_len; i++) {
187
		c = (uint8_t)utf->u_name[i];
L
Linus Torvalds 已提交
188 189

		/* Complete a multi-byte UTF-8 character */
190
		if (utf_cnt) {
L
Linus Torvalds 已提交
191 192 193
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
194
		} else {
L
Linus Torvalds 已提交
195
			/* Check for a multi-byte UTF-8 character */
196
			if (c & 0x80U) {
L
Linus Torvalds 已提交
197
				/* Start a multi-byte UTF-8 character */
198
				if ((c & 0xe0U) == 0xc0U) {
L
Linus Torvalds 已提交
199 200
					utf_char = c & 0x1fU;
					utf_cnt = 1;
201
				} else if ((c & 0xf0U) == 0xe0U) {
L
Linus Torvalds 已提交
202 203
					utf_char = c & 0x0fU;
					utf_cnt = 2;
204
				} else if ((c & 0xf8U) == 0xf0U) {
L
Linus Torvalds 已提交
205 206
					utf_char = c & 0x07U;
					utf_cnt = 3;
207
				} else if ((c & 0xfcU) == 0xf8U) {
L
Linus Torvalds 已提交
208 209
					utf_char = c & 0x03U;
					utf_cnt = 4;
210
				} else if ((c & 0xfeU) == 0xfcU) {
L
Linus Torvalds 已提交
211 212
					utf_char = c & 0x01U;
					utf_cnt = 5;
213
				} else {
L
Linus Torvalds 已提交
214
					goto error_out;
215
				}
L
Linus Torvalds 已提交
216
				continue;
217
			} else {
L
Linus Torvalds 已提交
218 219
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
220
			}
L
Linus Torvalds 已提交
221 222 223
		}

		/* Choose no compression if necessary */
224
		if (utf_char > max_val) {
225
			if (max_val == 0xffU) {
L
Linus Torvalds 已提交
226
				max_val = 0xffffU;
227
				ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
228 229 230 231 232
				goto try_again;
			}
			goto error_out;
		}

M
Marcin Slusarz 已提交
233
		if (max_val == 0xffffU)
234 235
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
L
Linus Torvalds 已提交
236 237
	}

238
	if (utf_cnt) {
239
error_out:
L
Linus Torvalds 已提交
240
		ocu[++u_len] = '?';
J
Joe Perches 已提交
241
		printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
L
Linus Torvalds 已提交
242 243
	}

244 245
	ocu[length - 1] = (uint8_t)u_len + 1;

L
Linus Torvalds 已提交
246 247 248
	return u_len + 1;
}

249
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
250
			const struct ustr *ocu_i)
L
Linus Torvalds 已提交
251
{
252
	const uint8_t *ocu;
L
Linus Torvalds 已提交
253
	uint8_t cmp_id, ocu_len;
254
	int i, len;
L
Linus Torvalds 已提交
255 256 257


	ocu_len = ocu_i->u_len;
258
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
259 260 261 262
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

263 264 265
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
J
Joe Perches 已提交
266
		pr_err("unknown compression code (%d) stri=%s\n",
267
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
268 269 270
		return 0;
	}

271 272
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
273
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
274
		/* Expand OSTA compressed Unicode to Unicode */
275
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
276 277 278
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

279 280 281 282 283 284 285
		len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
				    UDF_NAME_LEN - utf_o->u_len);
		/* Valid character? */
		if (len >= 0)
			utf_o->u_len += len;
		else
			utf_o->u_name[utf_o->u_len++] = '?';
L
Linus Torvalds 已提交
286
	}
287
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
288 289 290 291

	return utf_o->u_len;
}

292
/*
 * udf_NLStoCS0
 *
 * PURPOSE
 *	Convert a name in the character set of an NLS table to OSTA
 *	Compressed Unicode.
 *
 * DESCRIPTION
 *	The name is first encoded with one byte per character (compression
 *	ID 8); if a character above 0xff is met, encoding restarts with two
 *	bytes per character (compression ID 16).  Input that char2uni()
 *	rejects is consumed one byte at a time and replaced by '?'.
 *
 * PRE-CONDITIONS
 *	nls			NLS table whose char2uni() does the decoding.
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of "length" bytes.
 *	uni			Pointer to the input struct ustr.
 *	length			Size of the output buffer in bytes.
 *
 * POST-CONDITIONS
 *	ocu[0]			Compression ID (8 or 16).
 *	ocu[length - 1]		Number of bytes used, including ocu[0].
 *	<return>		Number of bytes used, including ocu[0].
 *				Zero if the encoded name does not fit in the
 *				output buffer.
 */
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
			int length)
{
	int len;
	unsigned i, max_val;
	uint16_t uni_char;
	int u_len, u_ch;

	/* Need at least the compression ID and the trailing length byte */
	if (length < 2)
		return 0;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;
	u_ch = 1;	/* output bytes per character */

try_again:
	u_len = 0U;
	for (i = 0U; i < uni->u_len; i++) {
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
		if (!len)
			continue;
		/* Invalid character, deal with it */
		if (len < 0) {
			len = 1;
			uni_char = '?';
		}

		if (uni_char > max_val) {
			max_val = 0xffffU;
			ocu[0] = (uint8_t)0x10U;
			u_ch = 2;
			goto try_again;
		}

		/*
		 * Name didn't fit?  Character bytes may only occupy
		 * ocu[1] .. ocu[length - 2]; ocu[length - 1] holds the
		 * length.
		 */
		if (u_len + u_ch > length - 2)
			return 0;

		if (max_val == 0xffffU)
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);

		/* Skip the remaining input bytes of a multi-byte character */
		i += len - 1;
	}

	ocu[length - 1] = (uint8_t)u_len + 1;
	return u_len + 1;
}

332 333
int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
L
Linus Torvalds 已提交
334
{
335
	struct ustr *filename, *unifilename;
336
	int ret = 0;
L
Linus Torvalds 已提交
337

338 339 340
	if (!slen)
		return -EIO;

341 342
	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!filename)
343
		return -ENOMEM;
L
Linus Torvalds 已提交
344

345
	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 347
	if (!unifilename) {
		ret = -ENOMEM;
348
		goto out1;
349
	}
350

351
	udf_build_ustr_exact(unifilename, sname, slen);
352
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
353
		if (!udf_CS0toUTF8(filename, unifilename)) {
M
Marcin Slusarz 已提交
354 355
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
356
			goto out2;
L
Linus Torvalds 已提交
357
		}
358
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
359 360
		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
				  unifilename)) {
M
Marcin Slusarz 已提交
361 362
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
363
			goto out2;
L
Linus Torvalds 已提交
364
		}
M
Marcin Slusarz 已提交
365
	} else
366 367
		goto out2;

368
	ret = udf_translate_to_linux(dname, dlen,
369
				     filename->u_name, filename->u_len,
370 371 372 373 374
				     unifilename->u_name, unifilename->u_len);
out2:
	kfree(unifilename);
out1:
	kfree(filename);
375
	return ret;
L
Linus Torvalds 已提交
376 377
}

378 379
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
		     uint8_t *dname, int flen)
L
Linus Torvalds 已提交
380 381 382 383
{
	struct ustr unifilename;
	int namelen;

M
Marcin Slusarz 已提交
384
	if (!udf_char_to_ustr(&unifilename, sname, flen))
L
Linus Torvalds 已提交
385 386
		return 0;

387
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
388
		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
M
Marcin Slusarz 已提交
389
		if (!namelen)
L
Linus Torvalds 已提交
390
			return 0;
391
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
392 393 394
		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
					&unifilename, UDF_NAME_LEN);
		if (!namelen)
L
Linus Torvalds 已提交
395
			return 0;
M
Marcin Slusarz 已提交
396
	} else
L
Linus Torvalds 已提交
397 398 399 400 401 402
		return 0;

	return namelen;
}

#define ILLEGAL_CHAR_MARK	'_'
#define EXT_MARK		'.'
#define CRC_MARK		'#'
/* Longest extension (characters after EXT_MARK) that is preserved */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * udf_translate_to_linux
 *
 * PURPOSE
 *	Turn a converted UDF name into a name that is usable as a directory
 *	entry.
 *
 * DESCRIPTION
 *	Each run of '/' and NUL characters is replaced by a single
 *	ILLEGAL_CHAR_MARK.  If the name was altered, had to be truncated,
 *	or is "." or "..", a suffix of CRC_MARK plus four upper-case hex
 *	digits of crc_itu_t() over fidName is added.  An extension of at
 *	most EXT_SIZE characters is kept after the CRC suffix.
 *
 * PRE-CONDITIONS
 *	newName			Output buffer.
 *	newLen			Size of the output buffer in bytes.
 *	udfName			Converted name to translate.
 *	udfLen			Length of udfName in bytes (at least 1).
 *	fidName			Bytes the CRC is computed over.
 *	fidNameLen		Length of fidName in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Number of bytes stored in newName.  Zero if
 *				a CRC suffix is required but does not fit in
 *				newLen bytes.
 */
static int udf_translate_to_linux(uint8_t *newName, int newLen,
				  uint8_t *udfName, int udfLen,
				  uint8_t *fidName, int fidNameLen)
{
	int index, newIndex = 0, needsCRC = 0;
	int extIndex = 0, newExtIndex = 0, hasExt = 0;
	unsigned short valueCRC;
	uint8_t curr;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		/* "." and ".." always get a CRC suffix */
		needsCRC = 1;
		newIndex = udfLen;
		/* If this does not fit, the suffix check below bails out */
		if (udfLen <= newLen)
			memcpy(newName, udfName, udfLen);
	} else {
		for (index = 0; index < udfLen; index++) {
			curr = udfName[index];
			if (curr == '/' || curr == 0) {
				needsCRC = 1;
				curr = ILLEGAL_CHAR_MARK;
				/* Collapse the whole run into one mark */
				while (index + 1 < udfLen &&
						(udfName[index + 1] == '/' ||
						 udfName[index + 1] == 0))
					index++;
			}
			/* Remember the last dot that starts a short extension */
			if (curr == EXT_MARK &&
					(udfLen - index - 1) <= EXT_SIZE) {
				if (udfLen == index + 1)
					hasExt = 0;
				else {
					hasExt = 1;
					extIndex = index;
					newExtIndex = newIndex;
				}
			}
			if (newIndex < newLen)
				newName[newIndex++] = curr;
			else
				needsCRC = 1;
		}
	}
	if (needsCRC) {
		uint8_t ext[EXT_SIZE];
		int localExtIndex = 0;
		int suffixLen = CRC_LEN;
		int maxFilenameLen;

		if (hasExt) {
			/* Re-translate the extension into a private copy */
			for (index = 0;
			     index < EXT_SIZE && extIndex + index + 1 < udfLen;
			     index++) {
				curr = udfName[extIndex + index + 1];

				if (curr == '/' || curr == 0) {
					curr = ILLEGAL_CHAR_MARK;
					while (extIndex + index + 2 < udfLen &&
					      (index + 1 < EXT_SIZE &&
						(udfName[extIndex + index + 2] == '/' ||
						 udfName[extIndex + index + 2] == 0)))
						index++;
				}
				ext[localExtIndex++] = curr;
			}
			/* EXT_MARK and the extension follow the CRC */
			suffixLen += 1 + localExtIndex;
		}

		/* Refuse to write outside newName */
		if (suffixLen > newLen)
			return 0;

		/*
		 * Leave room for the whole suffix, including EXT_MARK.
		 * When nothing has to be cut off, the suffix simply
		 * replaces the original extension.
		 */
		maxFilenameLen = newLen - suffixLen;
		if (newIndex > maxFilenameLen)
			newIndex = maxFilenameLen;
		else if (hasExt)
			newIndex = newExtIndex;

		newName[newIndex++] = CRC_MARK;
		valueCRC = crc_itu_t(0, fidName, fidNameLen);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
		newName[newIndex++] = hex_asc_upper_hi(valueCRC);
		newName[newIndex++] = hex_asc_upper_lo(valueCRC);

		if (hasExt) {
			newName[newIndex++] = EXT_MARK;
			for (index = 0; index < localExtIndex; index++)
				newName[newIndex++] = ext[index];
		}
	}

	return newIndex;
}