unicode.c 10.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>
26
#include <linux/crc-itu-t.h>
L
Linus Torvalds 已提交
27 28 29 30 31

#include "udf_sb.h"

static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);

32
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
L
Linus Torvalds 已提交
33
{
34
	if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
L
Linus Torvalds 已提交
35
		return 0;
36

L
Linus Torvalds 已提交
37 38 39 40
	memset(dest, 0, sizeof(struct ustr));
	memcpy(dest->u_name, src, strlen);
	dest->u_cmpID = 0x08;
	dest->u_len = strlen;
41

L
Linus Torvalds 已提交
42 43 44 45 46 47
	return strlen;
}

/*
 * udf_build_ustr
 */
48
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
L
Linus Torvalds 已提交
49 50 51
{
	int usesize;

M
Marcin Slusarz 已提交
52
	if (!dest || !ptr || !size)
L
Linus Torvalds 已提交
53
		return -1;
M
Marcin Slusarz 已提交
54
	BUG_ON(size < 2);
L
Linus Torvalds 已提交
55

M
Marcin Slusarz 已提交
56 57
	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
	usesize = min(usesize, size - 2);
58
	dest->u_cmpID = ptr[0];
M
Marcin Slusarz 已提交
59 60 61
	dest->u_len = usesize;
	memcpy(dest->u_name, ptr + 1, usesize);
	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
62

L
Linus Torvalds 已提交
63 64 65 66 67 68
	return 0;
}

/*
 * udf_build_ustr_exact
 */
69
static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
L
Linus Torvalds 已提交
70
{
71
	if ((!dest) || (!ptr) || (!exactsize))
L
Linus Torvalds 已提交
72 73 74
		return -1;

	memset(dest, 0, sizeof(struct ustr));
75 76 77
	dest->u_cmpID = ptr[0];
	dest->u_len = exactsize - 1;
	memcpy(dest->u_name, ptr + 1, exactsize - 1);
78

L
Linus Torvalds 已提交
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
	return 0;
}

/*
 * udf_ocu_to_utf8
 *
 * PURPOSE
 *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
 *
 * PRE-CONDITIONS
 *	utf			Pointer to UTF-8 output buffer.
 *	ocu			Pointer to OSTA Compressed Unicode input buffer
 *				of size UDF_NAME_LEN bytes.
 * 				both of type "struct ustr *"
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
101
int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
L
Linus Torvalds 已提交
102
{
103
	const uint8_t *ocu;
L
Linus Torvalds 已提交
104 105 106 107
	uint8_t cmp_id, ocu_len;
	int i;

	ocu_len = ocu_i->u_len;
108
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
109 110 111 112
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

113 114 115
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
116 117
		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
118 119 120
		return 0;
	}

121 122
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
123
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
124 125

		/* Expand OSTA compressed Unicode to Unicode */
126
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
127 128 129 130
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

		/* Compress Unicode to UTF-8 */
131
		if (c < 0x80U)
132
			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
133
		else if (c < 0x800U) {
M
Marcin Slusarz 已提交
134 135 136 137
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xc0 | (c >> 6));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
138
		} else {
M
Marcin Slusarz 已提交
139 140 141 142 143 144 145
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0xe0 | (c >> 12));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 |
							  ((c >> 6) & 0x3f));
			utf_o->u_name[utf_o->u_len++] =
						(uint8_t)(0x80 | (c & 0x3f));
L
Linus Torvalds 已提交
146 147
		}
	}
148
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175

	return utf_o->u_len;
}

/*
 *
 * udf_utf8_to_ocu
 *
 * PURPOSE
 *	Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 *
 * DESCRIPTION
 *	This routine is only called by udf_lookup().
 *
 * PRE-CONDITIONS
 *	ocu			Pointer to OSTA Compressed Unicode output
 *				buffer of size UDF_NAME_LEN bytes.
 *	utf			Pointer to UTF-8 input buffer.
 *	utf_len			Length of UTF-8 input buffer in bytes.
 *
 * POST-CONDITIONS
 *	<return>		Zero on success.
 *
 * HISTORY
 *	November 12, 1997 - Andrew E. Mileski
 *	Written, tested, and released.
 */
176
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
L
Linus Torvalds 已提交
177 178 179 180 181 182 183 184
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

185
try_again:
L
Linus Torvalds 已提交
186 187 188
	u_len = 0U;
	utf_char = 0U;
	utf_cnt = 0U;
189
	for (i = 0U; i < utf->u_len; i++) {
190
		c = (uint8_t)utf->u_name[i];
L
Linus Torvalds 已提交
191 192

		/* Complete a multi-byte UTF-8 character */
193
		if (utf_cnt) {
L
Linus Torvalds 已提交
194 195 196
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
197
		} else {
L
Linus Torvalds 已提交
198
			/* Check for a multi-byte UTF-8 character */
199
			if (c & 0x80U) {
L
Linus Torvalds 已提交
200
				/* Start a multi-byte UTF-8 character */
201
				if ((c & 0xe0U) == 0xc0U) {
L
Linus Torvalds 已提交
202 203
					utf_char = c & 0x1fU;
					utf_cnt = 1;
204
				} else if ((c & 0xf0U) == 0xe0U) {
L
Linus Torvalds 已提交
205 206
					utf_char = c & 0x0fU;
					utf_cnt = 2;
207
				} else if ((c & 0xf8U) == 0xf0U) {
L
Linus Torvalds 已提交
208 209
					utf_char = c & 0x07U;
					utf_cnt = 3;
210
				} else if ((c & 0xfcU) == 0xf8U) {
L
Linus Torvalds 已提交
211 212
					utf_char = c & 0x03U;
					utf_cnt = 4;
213
				} else if ((c & 0xfeU) == 0xfcU) {
L
Linus Torvalds 已提交
214 215
					utf_char = c & 0x01U;
					utf_cnt = 5;
216
				} else {
L
Linus Torvalds 已提交
217
					goto error_out;
218
				}
L
Linus Torvalds 已提交
219
				continue;
220
			} else {
L
Linus Torvalds 已提交
221 222
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
223
			}
L
Linus Torvalds 已提交
224 225 226
		}

		/* Choose no compression if necessary */
227
		if (utf_char > max_val) {
228
			if (max_val == 0xffU) {
L
Linus Torvalds 已提交
229
				max_val = 0xffffU;
230
				ocu[0] = (uint8_t)0x10U;
L
Linus Torvalds 已提交
231 232 233 234 235
				goto try_again;
			}
			goto error_out;
		}

M
Marcin Slusarz 已提交
236
		if (max_val == 0xffffU)
237 238
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
L
Linus Torvalds 已提交
239 240
	}

241
	if (utf_cnt) {
242
error_out:
L
Linus Torvalds 已提交
243 244 245 246
		ocu[++u_len] = '?';
		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
	}

247 248
	ocu[length - 1] = (uint8_t)u_len + 1;

L
Linus Torvalds 已提交
249 250 251
	return u_len + 1;
}

252
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
253
			const struct ustr *ocu_i)
L
Linus Torvalds 已提交
254
{
255
	const uint8_t *ocu;
L
Linus Torvalds 已提交
256 257 258 259 260
	uint8_t cmp_id, ocu_len;
	int i;


	ocu_len = ocu_i->u_len;
261
	if (ocu_len == 0) {
L
Linus Torvalds 已提交
262 263 264 265
		memset(utf_o, 0, sizeof(struct ustr));
		return 0;
	}

266 267 268
	cmp_id = ocu_i->u_cmpID;
	if (cmp_id != 8 && cmp_id != 16) {
		memset(utf_o, 0, sizeof(struct ustr));
269 270
		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
		       cmp_id, ocu_i->u_name);
L
Linus Torvalds 已提交
271 272 273
		return 0;
	}

274 275
	ocu = ocu_i->u_name;
	utf_o->u_len = 0;
276
	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
L
Linus Torvalds 已提交
277
		/* Expand OSTA compressed Unicode to Unicode */
278
		uint32_t c = ocu[i++];
L
Linus Torvalds 已提交
279 280 281
		if (cmp_id == 16)
			c = (c << 8) | ocu[i++];

282 283
		utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
					      UDF_NAME_LEN - utf_o->u_len);
L
Linus Torvalds 已提交
284
	}
285
	utf_o->u_cmpID = 8;
L
Linus Torvalds 已提交
286 287 288 289

	return utf_o->u_len;
}

290
/*
 * udf_NLStoCS0
 *
 * Convert a name in the character set of the given NLS table to OSTA
 * Compressed Unicode (CS0).
 *
 *	nls		NLS table whose char2uni() performs the conversion.
 *	ocu		Output dstring of 'length' bytes.
 *	uni		Input ustr holding the name in the NLS character set.
 *	length		Size of the output dstring in bytes.
 *
 * The name is first encoded with 8 bits per character; as soon as a
 * character above 0xff is met the whole name is re-encoded with 16 bits per
 * character.  Input that char2uni() rejects (negative return) is consumed
 * one byte at a time and replaced by '?'; a zero return skips the byte.
 *
 * On return ocu[0] holds the compression ID (8 or 16) and ocu[length - 1]
 * the number of bytes used, including ocu[0].
 *
 * Returns the number of bytes used, including ocu[0], or 0 if the converted
 * name does not fit.
 */
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
			int length)
{
	unsigned i, max_val;
	uint16_t uni_char;
	int len, u_len, unit;

	/* Need room for at least the compression ID and the length byte. */
	if (length < 2)
		return 0;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;
	unit = 1;		/* output bytes per character */

try_again:
	u_len = 0;
	for (i = 0U; i < uni->u_len; i++) {
		/* len must be signed so that error returns stay negative. */
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
		if (!len)
			continue;
		if (len < 0) {
			/* Invalid input: emit '?' and consume one byte. */
			len = 1;
			uni_char = '?';
		}

		if (uni_char > max_val) {
			max_val = 0xffffU;
			ocu[0] = (uint8_t)0x10U;
			unit = 2;
			goto try_again;
		}

		/*
		 * Data may occupy ocu[1] .. ocu[length - 2]; the last byte is
		 * reserved for the length.  Fail instead of overrunning.
		 */
		if (u_len + unit > length - 2)
			return 0;

		if (unit == 2)
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
		i += len - 1;
	}

	ocu[length - 1] = (uint8_t)(u_len + 1);
	return u_len + 1;
}

324
int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
325
		     int flen)
L
Linus Torvalds 已提交
326
{
327 328
	struct ustr *filename, *unifilename;
	int len = 0;
L
Linus Torvalds 已提交
329

330 331
	filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!filename)
L
Linus Torvalds 已提交
332 333
		return 0;

334 335 336 337 338 339 340
	unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
	if (!unifilename)
		goto out1;

	if (udf_build_ustr_exact(unifilename, sname, flen))
		goto out2;

341
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
342
		if (!udf_CS0toUTF8(filename, unifilename)) {
M
Marcin Slusarz 已提交
343 344
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
345
			goto out2;
L
Linus Torvalds 已提交
346
		}
347
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
348 349
		if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
				  unifilename)) {
M
Marcin Slusarz 已提交
350 351
			udf_debug("Failed in udf_get_filename: sname = %s\n",
				  sname);
352
			goto out2;
L
Linus Torvalds 已提交
353
		}
M
Marcin Slusarz 已提交
354
	} else
355 356 357 358 359 360 361 362 363
		goto out2;

	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
				     unifilename->u_name, unifilename->u_len);
out2:
	kfree(unifilename);
out1:
	kfree(filename);
	return len;
L
Linus Torvalds 已提交
364 365
}

366 367
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
		     uint8_t *dname, int flen)
L
Linus Torvalds 已提交
368 369 370 371
{
	struct ustr unifilename;
	int namelen;

M
Marcin Slusarz 已提交
372
	if (!udf_char_to_ustr(&unifilename, sname, flen))
L
Linus Torvalds 已提交
373 374
		return 0;

375
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
376
		namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
M
Marcin Slusarz 已提交
377
		if (!namelen)
L
Linus Torvalds 已提交
378
			return 0;
379
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
M
Marcin Slusarz 已提交
380 381 382
		namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
					&unifilename, UDF_NAME_LEN);
		if (!namelen)
L
Linus Torvalds 已提交
383
			return 0;
M
Marcin Slusarz 已提交
384
	} else
L
Linus Torvalds 已提交
385 386 387 388 389 390
		return 0;

	return namelen;
}

#define ILLEGAL_CHAR_MARK	'_'
#define EXT_MARK		'.'
#define CRC_MARK		'#'
#define EXT_SIZE 		5

/* True for the bytes that get replaced in a translated name: '/' and NUL. */
static inline int udf_is_illegal_char(uint8_t c)
{
	return c == '/' || c == 0;
}

/*
 * udf_translate_to_linux
 *
 * Copy udfName (udfLen bytes) into newName, mangling it where needed.
 *
 *  - The names "." and ".." are copied as-is and always mangled.
 *  - Every run of '/' and NUL bytes becomes a single ILLEGAL_CHAR_MARK and
 *    forces mangling.
 *  - At most 256 bytes are copied; longer names force mangling.
 *  - An EXT_MARK followed by 1 to EXT_SIZE more bytes starts the extension;
 *    the last such mark wins.
 *
 * A mangled name is cut at the extension mark (or to 250 bytes when it has
 * no extension, or to 250 minus the extension length when it is longer than
 * that) and gets CRC_MARK plus the four upper-case hex digits of
 * crc_itu_t(0, fidName, fidNameLen), followed by EXT_MARK and the sanitised
 * extension when there is one.
 *
 * Returns the number of bytes written to newName; no terminator is added.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
				  int udfLen, uint8_t *fidName,
				  int fidNameLen)
{
	static const uint8_t hex_digits[] = "0123456789ABCDEF";
	uint8_t ext[EXT_SIZE];
	uint8_t ch;
	unsigned short crc;
	int src, k, shift;
	int out_len = 0, ext_len = 0;
	int mangle = 0, has_ext = 0;
	int ext_src = 0, ext_out = 0;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
		mangle = 1;
		out_len = udfLen;
		memcpy(newName, udfName, udfLen);
	} else {
		for (src = 0; src < udfLen; src++) {
			ch = udfName[src];
			if (udf_is_illegal_char(ch)) {
				mangle = 1;
				ch = ILLEGAL_CHAR_MARK;
				/* Collapse the rest of the illegal run. */
				while (src + 1 < udfLen &&
				       udf_is_illegal_char(udfName[src + 1]))
					src++;
			}
			if (ch == EXT_MARK &&
			    (udfLen - src - 1) <= EXT_SIZE) {
				/* A trailing mark is not an extension. */
				has_ext = (udfLen != src + 1);
				if (has_ext) {
					ext_src = src;
					ext_out = out_len;
				}
			}
			if (out_len < 256)
				newName[out_len++] = ch;
			else
				mangle = 1;
		}
	}

	if (!mangle)
		return out_len;

	if (has_ext) {
		int max_base;

		/* Collect the sanitised extension from the source name. */
		for (k = 0;
		     k < EXT_SIZE && ext_src + k + 1 < udfLen;
		     k++) {
			ch = udfName[ext_src + k + 1];

			if (udf_is_illegal_char(ch)) {
				ch = ILLEGAL_CHAR_MARK;
				while (ext_src + k + 2 < udfLen &&
				       k + 1 < EXT_SIZE &&
				       udf_is_illegal_char(
						udfName[ext_src + k + 2]))
					k++;
			}
			ext[ext_len++] = ch;
		}
		max_base = 250 - ext_len;
		out_len = (out_len > max_base) ? max_base : ext_out;
	} else if (out_len > 250) {
		out_len = 250;
	}

	newName[out_len++] = CRC_MARK;
	crc = crc_itu_t(0, fidName, fidNameLen);
	/* Most significant nibble first. */
	for (shift = 12; shift >= 0; shift -= 4)
		newName[out_len++] = hex_digits[(crc >> shift) & 0xf];

	if (has_ext) {
		newName[out_len++] = EXT_MARK;
		for (k = 0; k < ext_len; k++)
			newName[out_len++] = ext[k];
	}

	return out_len;
}