unicode.c 8.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>	/* for memset */
#include <linux/nls.h>
26
#include <linux/crc-itu-t.h>
27
#include <linux/slab.h>
L
Linus Torvalds 已提交
28 29 30

#include "udf_sb.h"

31 32 33
/*
 * Encode one Unicode value as UTF-8.
 *
 * @uni:      value to encode
 * @out:      destination buffer
 * @boundlen: number of bytes available at @out
 *
 * Values below 0x80 take one byte, values below 0x800 take two bytes and
 * everything else is written as a three byte sequence.
 *
 * Returns the number of bytes stored, or -ENAMETOOLONG when @boundlen is
 * not positive or too small for the sequence (nothing is written then).
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	int need;

	/* Work out the sequence length first so the bounds check is shared */
	if (uni < 0x80)
		need = 1;
	else if (uni < 0x800)
		need = 2;
	else
		need = 3;

	if (boundlen <= 0 || boundlen < need)
		return -ENAMETOOLONG;

	switch (need) {
	case 1:
		out[0] = (unsigned char)uni;
		break;
	case 2:
		out[0] = (unsigned char)(0xc0 | (uni >> 6));
		out[1] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	default:
		out[0] = (unsigned char)(0xe0 | (uni >> 12));
		out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
		out[2] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	}

	return need;
}

57 58 59
/*
 * Decode one UTF-8 sequence from the start of @in.
 *
 * @in:       input bytes
 * @boundlen: number of bytes available at @in
 * @uni:      receives the decoded character
 *
 * Returns the number of input bytes consumed on success.  With
 * @boundlen <= 0 nothing is read, 0 is returned and *@uni is left
 * untouched.
 *
 * Returns -EINVAL and stores '?' in *@uni when
 *  - the first byte is not a valid UTF-8 lead byte,
 *  - the input ends in the middle of a sequence,
 *  - a trail byte is not of the form 10xxxxxx, or
 *  - the decoded value is above 0xffff.  OSTA CS0 stores at most 16 bits
 *    per character (see udf_name_to_CS0()), so such a value cannot be
 *    represented and must not be passed on truncated.
 */
static int udf_char2uni_utf8(const unsigned char *in,
			     int boundlen,
			     wchar_t *uni)
{
	unsigned int utf_char;
	unsigned char c;
	int utf_cnt, u_len;

	utf_char = 0;
	utf_cnt = 0;	/* trail bytes still expected; -1 marks an error */
	for (u_len = 0; u_len < boundlen;) {
		c = in[u_len++];

		/* Complete a multi-byte UTF-8 character */
		if (utf_cnt) {
			/* Every trail byte must look like 10xxxxxx */
			if ((c & 0xc0) != 0x80) {
				utf_cnt = -1;
				break;
			}
			utf_char = (utf_char << 6) | (c & 0x3f);
			if (--utf_cnt)
				continue;
			/* Sequence complete: refuse what CS0 cannot hold */
			if (utf_char > 0xffff) {
				utf_cnt = -1;
				break;
			}
		} else {
			/* Check for a multi-byte UTF-8 character */
			if (c & 0x80) {
				/* Start a multi-byte UTF-8 character */
				if ((c & 0xe0) == 0xc0) {
					utf_char = c & 0x1f;
					utf_cnt = 1;
				} else if ((c & 0xf0) == 0xe0) {
					utf_char = c & 0x0f;
					utf_cnt = 2;
				} else if ((c & 0xf8) == 0xf0) {
					utf_char = c & 0x07;
					utf_cnt = 3;
				} else if ((c & 0xfc) == 0xf8) {
					utf_char = c & 0x03;
					utf_cnt = 4;
				} else if ((c & 0xfe) == 0xfc) {
					utf_char = c & 0x01;
					utf_cnt = 5;
				} else {
					/* Stray trail byte or 0xfe/0xff */
					utf_cnt = -1;
					break;
				}
				continue;
			} else {
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
			}
		}
		*uni = utf_char;
		break;
	}
	/* Non-zero here means either an error or a truncated sequence */
	if (utf_cnt) {
		*uni = '?';
		return -EINVAL;
	}
	return u_len;
}

114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
/* Written in place of a run of illegal characters ('/' or NUL) in a name */
#define ILLEGAL_CHAR_MARK	'_'
/* Character that introduces a file name extension */
#define EXT_MARK		'.'
/* First character of the CRC tag ('#' followed by four hex digits) */
#define CRC_MARK		'#'
/* Number of trailing input characters examined when looking for EXT_MARK */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * Convert the next character of a CS0 name and append it to the output.
 *
 * @str_o, @str_o_max_len: output buffer and its size in bytes
 * @str_o_idx:             in/out write position in @str_o
 * @str_i, @str_i_max_len: input characters and their length in bytes
 * @str_i_idx:             in/out read position in @str_i
 * @u_ch:                  bytes per input character (2 means big-endian
 *                         16-bit values)
 * @needsCRC:              set to 1 when the output no longer matches the
 *                         input exactly (illegal or unconvertible
 *                         character, or output buffer exhausted)
 * @conv_f:                converts one character into the output charset
 * @translate:             when non-zero, '/' and NUL are illegal; a whole
 *                         run of them becomes one ILLEGAL_CHAR_MARK
 *
 * Returns 1 when a character was produced and 0 when the input is
 * exhausted or the output has no room left.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
			      int *str_o_idx,
			      const uint8_t *str_i, int str_i_max_len,
			      int *str_i_idx,
			      int u_ch, int *needsCRC,
			      int (*conv_f)(wchar_t, unsigned char *, int),
			      int translate)
{
	uint32_t c = 0;
	int saw_illegal = 0;
	int have_char = 0;
	int written;

	while (!have_char && *str_i_idx < str_i_max_len) {
		/* Output already full: nothing more can be stored */
		if (*str_o_idx >= str_o_max_len) {
			*needsCRC = 1;
			return 0;
		}

		/* Expand OSTA compressed Unicode to Unicode */
		c = str_i[*str_i_idx];
		if (u_ch > 1)
			c = (c << 8) | str_i[*str_i_idx + 1];

		if (translate && (c == '/' || c == 0)) {
			/* Swallow the illegal character and keep scanning */
			saw_illegal = 1;
		} else if (saw_illegal) {
			/*
			 * First legal character after an illegal run: leave it
			 * unconsumed so that the next call picks it up.
			 */
			break;
		} else {
			have_char = 1;
		}
		*str_i_idx += u_ch;
	}

	/* A run of illegal characters is replaced by a single marker */
	if (saw_illegal) {
		*needsCRC = 1;
		c = ILLEGAL_CHAR_MARK;
		have_char = 1;
	}

	if (!have_char)
		return 0;

	written = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
	if (written >= 0) {
		/* Valid character */
		*str_o_idx += written;
		return 1;
	}

	*needsCRC = 1;
	if (written == -ENAMETOOLONG)
		return 0;

	/* Character cannot be represented in the output charset */
	str_o[(*str_o_idx)++] = '?';
	return 1;
}

172 173
static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
			     const uint8_t *ocu, int ocu_len,
174 175
			     int (*conv_f)(wchar_t, unsigned char *, int),
			     int translate)
L
Linus Torvalds 已提交
176
{
177
	uint32_t c;
178
	uint8_t cmp_id;
179 180 181 182 183 184 185 186 187 188 189 190
	int idx, len;
	int u_ch;
	int needsCRC = 0;
	int ext_i_len, ext_max_len;
	int str_o_len = 0;	/* Length of resulting output */
	int ext_o_len = 0;	/* Extension output length */
	int ext_crc_len = 0;	/* Extension output length if used with CRC */
	int i_ext = -1;		/* Extension position in input buffer */
	int o_crc = 0;		/* Rightmost possible output pos for CRC+ext */
	unsigned short valueCRC;
	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
	uint8_t crc[CRC_LEN];
L
Linus Torvalds 已提交
191

192 193
	if (str_max_len <= 0)
		return 0;
L
Linus Torvalds 已提交
194

195
	if (ocu_len == 0) {
196
		memset(str_o, 0, str_max_len);
L
Linus Torvalds 已提交
197 198 199
		return 0;
	}

200
	cmp_id = ocu[0];
201
	if (cmp_id != 8 && cmp_id != 16) {
202
		memset(str_o, 0, str_max_len);
203
		pr_err("unknown compression code (%d)\n", cmp_id);
204
		return -EINVAL;
L
Linus Torvalds 已提交
205
	}
206
	u_ch = cmp_id >> 3;
L
Linus Torvalds 已提交
207

208 209
	ocu++;
	ocu_len--;
L
Linus Torvalds 已提交
210

211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
	if (ocu_len % u_ch) {
		pr_err("incorrect filename length (%d)\n", ocu_len + 1);
		return -EINVAL;
	}

	if (translate) {
		/* Look for extension */
		for (idx = ocu_len - u_ch, ext_i_len = 0;
		     (idx >= 0) && (ext_i_len < EXT_SIZE);
		     idx -= u_ch, ext_i_len++) {
			c = ocu[idx];
			if (u_ch > 1)
				c = (c << 8) | ocu[idx + 1];

			if (c == EXT_MARK) {
				if (ext_i_len)
					i_ext = idx;
				break;
			}
		}
		if (i_ext >= 0) {
			/* Convert extension */
			ext_max_len = min_t(int, sizeof(ext), str_max_len);
			ext[ext_o_len++] = EXT_MARK;
			idx = i_ext + u_ch;
			while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
						  ocu, ocu_len, &idx,
						  u_ch, &needsCRC,
						  conv_f, translate)) {
				if ((ext_o_len + CRC_LEN) < str_max_len)
					ext_crc_len = ext_o_len;
			}
		}
	}

	idx = 0;
	while (1) {
		if (translate && (idx == i_ext)) {
			if (str_o_len > (str_max_len - ext_o_len))
				needsCRC = 1;
251
			break;
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
		}

		if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
					ocu, ocu_len, &idx,
					u_ch, &needsCRC, conv_f, translate))
			break;

		if (translate &&
		    (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
			o_crc = str_o_len;
	}

	if (translate) {
		if (str_o_len <= 2 && str_o[0] == '.' &&
		    (str_o_len == 1 || str_o[1] == '.'))
			needsCRC = 1;
		if (needsCRC) {
			str_o_len = o_crc;
			valueCRC = crc_itu_t(0, ocu, ocu_len);
			crc[0] = CRC_MARK;
			crc[1] = hex_asc_upper_hi(valueCRC >> 8);
			crc[2] = hex_asc_upper_lo(valueCRC >> 8);
			crc[3] = hex_asc_upper_hi(valueCRC);
			crc[4] = hex_asc_upper_lo(valueCRC);
			len = min_t(int, CRC_LEN, str_max_len - str_o_len);
			memcpy(&str_o[str_o_len], crc, len);
			str_o_len += len;
			ext_o_len = ext_crc_len;
		}
		if (ext_o_len > 0) {
			memcpy(&str_o[str_o_len], ext, ext_o_len);
			str_o_len += ext_o_len;
		}
L
Linus Torvalds 已提交
285 286
	}

287
	return str_o_len;
L
Linus Torvalds 已提交
288 289
}

290 291
/*
 * Convert a name in the local charset into OSTA CS0.
 *
 * @ocu, @ocu_max_len: output buffer and its size in bytes; it is zero
 *                     filled first and byte 0 receives the compression ID
 * @str_i, @str_len:   input name and its length in bytes
 * @conv_f:            decodes one character from the input; returns the
 *                     number of bytes consumed, 0 to skip a byte, or a
 *                     negative value for an invalid sequence (which is
 *                     stored as '?' and consumes one byte)
 *
 * Encoding starts with 8 bits per character.  As soon as a character above
 * 0xff shows up, the conversion restarts from the beginning using 16 bits
 * per character (compression ID 0x10, high byte first).
 *
 * Returns the number of bytes used in @ocu including the ID byte, or 0
 * when @ocu_max_len <= 0 or the name does not fit.
 */
static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
			   const uint8_t *str_i, int str_len,
			   int (*conv_f)(const unsigned char *, int, wchar_t *))
{
	unsigned int limit = 0xff;	/* largest value the current width holds */
	int width = 1;			/* output bytes per character */
	int pos, out_len, consumed;
	wchar_t ch;

	if (ocu_max_len <= 0)
		return 0;

	memset(ocu, 0, ocu_max_len);
	ocu[0] = 8;

restart:
	out_len = 1;
	pos = 0;
	while (pos < str_len) {
		/* Name didn't fit? */
		if (out_len + width > ocu_max_len)
			return 0;

		consumed = conv_f(&str_i[pos], str_len - pos, &ch);
		if (!consumed) {
			pos++;
			continue;
		}

		/* Invalid character, deal with it */
		if (consumed < 0) {
			consumed = 1;
			ch = '?';
		}

		/* Too wide for the current encoding: switch and start over */
		if (ch > limit) {
			limit = 0xffff;
			ocu[0] = 0x10;
			width = 2;
			goto restart;
		}

		if (limit == 0xffff)
			ocu[out_len++] = (uint8_t)(ch >> 8);
		ocu[out_len++] = (uint8_t)(ch & 0xff);

		pos += consumed;
	}

	return out_len;
}

338 339
int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
		      const uint8_t *ocu_i, int i_len)
340
{
341 342 343 344 345 346 347 348 349 350 351 352
	int s_len = 0;

	if (i_len > 0) {
		s_len = ocu_i[i_len - 1];
		if (s_len >= i_len) {
			pr_err("incorrect dstring lengths (%d/%d)\n",
			       s_len, i_len);
			return -EINVAL;
		}
	}

	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
353
				 udf_uni2char_utf8, 0);
354 355
}

356
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
357
		     uint8_t *dname, int dlen)
L
Linus Torvalds 已提交
358
{
359
	int (*conv_f)(wchar_t, unsigned char *, int);
360
	int ret;
L
Linus Torvalds 已提交
361

362 363 364
	if (!slen)
		return -EIO;

365 366 367
	if (dlen <= 0)
		return 0;

368
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
369
		conv_f = udf_uni2char_utf8;
370
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
371
		conv_f = UDF_SB(sb)->s_nls_map->uni2char;
M
Marcin Slusarz 已提交
372
	} else
373
		BUG();
374

375
	ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
376 377 378
	/* Zero length filename isn't valid... */
	if (ret == 0)
		ret = -EINVAL;
379
	return ret;
L
Linus Torvalds 已提交
380 381
}

382 383
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
L
Linus Torvalds 已提交
384
{
385
	int (*conv_f)(const unsigned char *, int, wchar_t *);
L
Linus Torvalds 已提交
386

387
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
388
		conv_f = udf_char2uni_utf8;
389
	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
390
		conv_f = UDF_SB(sb)->s_nls_map->char2uni;
M
Marcin Slusarz 已提交
391
	} else
392
		BUG();
L
Linus Torvalds 已提交
393

394
	return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
L
Linus Torvalds 已提交
395 396
}