Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
2dot5
ClickHouse
提交
ebbe877a
C
ClickHouse
项目概览
2dot5
/
ClickHouse
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
C
ClickHouse
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ebbe877a
编写于
10月 29, 2016
作者:
A
Alexey Milovidov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Preparation [#METR-23332].
上级
f9eb5368
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
34 addition
and
32 deletion
+34
-32
dbms/include/DB/IO/WriteBufferValidUTF8.h
dbms/include/DB/IO/WriteBufferValidUTF8.h
+0
-3
dbms/src/IO/WriteBufferValidUTF8.cpp
dbms/src/IO/WriteBufferValidUTF8.cpp
+34
-29
未找到文件。
dbms/include/DB/IO/WriteBufferValidUTF8.h
浏览文件 @
ebbe877a
...
...
@@ -20,9 +20,6 @@ private:
bool
just_put_replacement
=
false
;
std
::
string
replacement
;
/// Таблица взята из ConvertUTF.c от Unicode, Inc. Позволяет узнать длину последовательности по первому байту.
static
const
char
trailingBytesForUTF8
[
256
];
void
putReplacement
();
void
putValid
(
char
*
data
,
size_t
len
);
...
...
dbms/src/IO/WriteBufferValidUTF8.cpp
浏览文件 @
ebbe877a
#include <Poco/UTF8Encoding.h>
#include <DB/IO/WriteBufferValidUTF8.h>
#include <DB/Core/Types.h>
#ifdef __x86_64__
#include <emmintrin.h>
...
...
@@ -11,28 +12,31 @@ namespace DB
const
size_t
WriteBufferValidUTF8
::
DEFAULT_SIZE
=
4096
;
/** Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
* left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
const
char
WriteBufferValidUTF8
::
trailingBytesForUTF8
[
256
]
=
namespace
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
4
,
4
,
4
,
4
,
5
,
5
,
5
,
5
};
/** Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
* left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
const
UInt8
length_of_utf8_sequence
[
256
]
=
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
4
,
4
,
4
,
4
,
4
,
4
,
4
,
4
,
5
,
5
,
5
,
5
,
6
,
6
,
6
,
6
};
}
WriteBufferValidUTF8
::
WriteBufferValidUTF8
(
WriteBuffer
&
output_buffer
,
bool
group_replacements
,
const
char
*
replacement
,
size_t
size
)
:
BufferWithOwnMemory
<
WriteBuffer
>
(
std
::
max
(
4
LU
,
size
)),
output_buffer
(
output_buffer
),
:
BufferWithOwnMemory
<
WriteBuffer
>
(
std
::
max
(
32
LU
,
size
)),
output_buffer
(
output_buffer
),
group_replacements
(
group_replacements
),
replacement
(
replacement
)
{
}
...
...
@@ -60,13 +64,13 @@ inline void WriteBufferValidUTF8::putValid(char *data, size_t len)
void
WriteBufferValidUTF8
::
nextImpl
()
{
char
*
p
=
memory
.
data
();
char
*
valid_start
=
p
;
char
*
p
=
memory
.
data
();
char
*
valid_start
=
p
;
while
(
p
<
pos
)
{
#ifdef __x86_64__
///
Быстрый пропуск
ASCII
///
Fast skip of
ASCII
static
constexpr
size_t
SIMD_BYTES
=
16
;
const
char
*
simd_end
=
p
+
(
pos
-
p
)
/
SIMD_BYTES
*
SIMD_BYTES
;
...
...
@@ -77,11 +81,11 @@ void WriteBufferValidUTF8::nextImpl()
break
;
#endif
size_t
len
=
1
+
static_cast
<
size_t
>
(
trailingBytesForUTF8
[
static_cast
<
unsigned
char
>
(
*
p
)])
;
size_t
len
=
length_of_utf8_sequence
[
static_cast
<
unsigned
char
>
(
*
p
)]
;
if
(
len
>
4
)
{
///
Невалидное начало последовательности. Пропустим один байт
.
///
Invalid start of sequence. Skip one byte
.
putValid
(
valid_start
,
p
-
valid_start
);
putReplacement
();
++
p
;
...
...
@@ -89,17 +93,17 @@ void WriteBufferValidUTF8::nextImpl()
}
else
if
(
p
+
len
>
pos
)
{
///
Еще не вся последовательность записана
.
///
Sequence was not fully written to this buffer
.
break
;
}
else
if
(
Poco
::
UTF8Encoding
::
isLegal
(
reinterpret_cast
<
unsigned
char
*>
(
p
),
len
))
else
if
(
Poco
::
UTF8Encoding
::
isLegal
(
reinterpret_cast
<
unsigned
char
*>
(
p
),
len
))
{
///
Валидная последовательность
.
///
Valid sequence
.
p
+=
len
;
}
else
{
///
Невалидная последовательность. Пропустим только первый байт
.
///
Invalid sequence. Skip just first byte
.
putValid
(
valid_start
,
p
-
valid_start
);
putReplacement
();
++
p
;
...
...
@@ -110,7 +114,8 @@ void WriteBufferValidUTF8::nextImpl()
putValid
(
valid_start
,
p
-
valid_start
);
size_t
cnt
=
pos
-
p
;
/// Сдвинем незаконченную последовательность в начало буфера.
/// Shift unfinished sequence to start of buffer.
for
(
size_t
i
=
0
;
i
<
cnt
;
++
i
)
memory
[
i
]
=
p
[
i
];
...
...
@@ -120,10 +125,10 @@ void WriteBufferValidUTF8::nextImpl()
void
WriteBufferValidUTF8
::
finish
()
{
///
Выпишем все полные последовательности из буфера
.
///
Write all complete sequences from buffer
.
nextImpl
();
///
Если осталась незаконченная последовательность, запишем
replacement.
///
If unfinished sequence at end, then write
replacement.
if
(
working_buffer
.
begin
()
!=
memory
.
data
())
putReplacement
();
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录