提交 1507ebf8 编写于 作者: Rich Felker

byte-based C locale, phase 1: multibyte character handling functions

this patch makes the functions which work directly on multibyte
characters treat the high bytes as individual abstract code units
rather than as multibyte sequences when MB_CUR_MAX is 1. since
MB_CUR_MAX is presently defined as a constant 4, all of the new code
added is dead code, and optimizing compilers' code generation should
not be affected at all. a future commit will activate the new code.

as abstract code units, bytes 0x80 to 0xff are represented by wchar_t
values 0xdf80 to 0xdfff, at the end of the surrogates range. this
ensures that they will never be misinterpreted as Unicode characters,
and that all wctype functions return false for these "characters"
without needing locale-specific logic. a high range outside of Unicode
such as 0x7fffff80 to 0x7fffffff was also considered, but since C11's
char16_t also needs to be able to represent conversions of these
bytes, the surrogate range was the natural choice.
上级 38e2f727
#include <stddef.h> #include <stdlib.h>
#include "locale_impl.h"
size_t __ctype_get_mb_cur_max() size_t __ctype_get_mb_cur_max()
{ {
return 4; return MB_CUR_MAX;
} }
...@@ -33,7 +33,8 @@ char *__nl_langinfo_l(nl_item item, locale_t loc) ...@@ -33,7 +33,8 @@ char *__nl_langinfo_l(nl_item item, locale_t loc)
int idx = item & 65535; int idx = item & 65535;
const char *str; const char *str;
if (item == CODESET) return "UTF-8"; if (item == CODESET)
return MB_CUR_MAX==1 ? "UTF-8-CODE-UNITS" : "UTF-8";
switch (cat) { switch (cat) {
case LC_NUMERIC: case LC_NUMERIC:
......
#include <stdio.h> #include <stdio.h>
#include <wchar.h> #include <wchar.h>
#include <stdlib.h>
#include "internal.h"
wint_t btowc(int c) wint_t btowc(int c)
{ {
c = (unsigned char)c; int b = (unsigned char)c;
return c<128U ? c : EOF; return b<128U ? b : (MB_CUR_MAX==1 && c!=EOF) ? CODEUNIT(c) : WEOF;
} }
...@@ -23,3 +23,10 @@ extern const uint32_t bittab[]; ...@@ -23,3 +23,10 @@ extern const uint32_t bittab[];
#define SA 0xc2u #define SA 0xc2u
#define SB 0xf4u #define SB 0xf4u
/* Arbitrary encoding for representing code units instead of characters. */
#define CODEUNIT(c) (0xdfff & (signed char)(c))
#define IS_CODEUNIT(c) ((unsigned)(c)-0xdf80 < 0x80)
/* Get inline definition of MB_CUR_MAX. */
#include "locale_impl.h"
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* unnecessary. * unnecessary.
*/ */
#include <stdlib.h>
#include <wchar.h> #include <wchar.h>
#include <errno.h> #include <errno.h>
#include "internal.h" #include "internal.h"
...@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate ...@@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate
if (!n) return -2; if (!n) return -2;
if (!c) { if (!c) {
if (*s < 0x80) return !!(*wc = *s); if (*s < 0x80) return !!(*wc = *s);
if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
if (*s-SA > SB-SA) goto ilseq; if (*s-SA > SB-SA) goto ilseq;
c = bittab[*s++-SA]; n--; c = bittab[*s++-SA]; n--;
} }
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
#include <stdint.h> #include <stdint.h>
#include <wchar.h> #include <wchar.h>
#include <errno.h> #include <errno.h>
#include <string.h>
#include <stdlib.h>
#include "internal.h" #include "internal.h"
size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st) size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st)
...@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs ...@@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs
} }
} }
if (MB_CUR_MAX==1) {
if (!ws) return strlen((const char *)s);
for (;;) {
if (!wn) {
*src = (const void *)s;
return wn0;
}
if (!*s) break;
c = *s++;
*ws++ = CODEUNIT(c);
wn--;
}
*ws = 0;
*src = 0;
return wn0-wn;
}
if (!ws) for (;;) { if (!ws) for (;;) {
if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) { if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) {
while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) { while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) {
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* unnecessary. * unnecessary.
*/ */
#include <stdlib.h>
#include <wchar.h> #include <wchar.h>
#include <errno.h> #include <errno.h>
#include "internal.h" #include "internal.h"
...@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n) ...@@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n)
if (!wc) wc = &dummy; if (!wc) wc = &dummy;
if (*s < 0x80) return !!(*wc = *s); if (*s < 0x80) return !!(*wc = *s);
if (MB_CUR_MAX==1) return (*wc = CODEUNIT(*s)), 1;
if (*s-SA > SB-SA) goto ilseq; if (*s-SA > SB-SA) goto ilseq;
c = bittab[*s++-SA]; c = bittab[*s++-SA];
......
...@@ -4,8 +4,10 @@ ...@@ -4,8 +4,10 @@
* unnecessary. * unnecessary.
*/ */
#include <stdlib.h>
#include <wchar.h> #include <wchar.h>
#include <errno.h> #include <errno.h>
#include "internal.h"
size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st) size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
{ {
...@@ -13,6 +15,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st) ...@@ -13,6 +15,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st)
if ((unsigned)wc < 0x80) { if ((unsigned)wc < 0x80) {
*s = wc; *s = wc;
return 1; return 1;
} else if (MB_CUR_MAX == 1) {
if (!IS_CODEUNIT(wc)) {
errno = EILSEQ;
return -1;
}
*s = wc;
return 1;
} else if ((unsigned)wc < 0x800) { } else if ((unsigned)wc < 0x800) {
*s++ = 0xc0 | (wc>>6); *s++ = 0xc0 | (wc>>6);
*s = 0x80 | (wc&0x3f); *s = 0x80 | (wc&0x3f);
......
#include <stdio.h>
#include <wchar.h> #include <wchar.h>
#include <stdlib.h>
#include "internal.h"
int wctob(wint_t c) int wctob(wint_t c)
{ {
if (c < 128U) return c; if (c < 128U) return c;
if (MB_CUR_MAX==1 && IS_CODEUNIT(c)) return (unsigned char)c;
return EOF; return EOF;
} }
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <wchar.h> #include <wchar.h>
#include <wctype.h> #include <wctype.h>
#include "locale_impl.h"
#define END 0 #define END 0
#define UNMATCHABLE -2 #define UNMATCHABLE -2
...@@ -229,7 +230,7 @@ static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n ...@@ -229,7 +230,7 @@ static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n
* On illegal sequences we may get it wrong, but in that case * On illegal sequences we may get it wrong, but in that case
* we necessarily have a matching failure anyway. */ * we necessarily have a matching failure anyway. */
for (s=endstr; s>str && tailcnt; tailcnt--) { for (s=endstr; s>str && tailcnt; tailcnt--) {
if (s[-1] < 128U) s--; if (s[-1] < 128U || MB_CUR_MAX==1) s--;
else while ((unsigned char)*--s-0x80U<0x40 && s>str); else while ((unsigned char)*--s-0x80U<0x40 && s>str);
} }
if (tailcnt) return FNM_NOMATCH; if (tailcnt) return FNM_NOMATCH;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册