未验证 提交 c92e613b 编写于 作者: Z zlx19950903 提交者: GitHub

Add a function `htmlOrXmlCoarseParse` to extract content from an HTML- or XML-formatted string. (#19600)

* add html and xml coarse parse

* add test file

* add conditional check: hyperscan

* fix style error

* add conditional check

* bug fix

* delete unit

* typos check fix

* add unit test

* style check fix

* fix build error: case style

* arcadia_skip test fix

* LINT error fix

* Remove comments
Co-authored-by: guojiantao <guojiantao15@mails.ucas.ac.cn>
Co-authored-by: Ivan <5627721+abyss7@users.noreply.github.com>
Co-authored-by: Ivan Lezhankin <ilezhankin@yandex-team.ru>
上级 419962db
......@@ -342,6 +342,7 @@ function run_tests
# JSON functions
01666_blns
01674_htm_xml_coarse_parse
)
(time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
......
此差异已折叠。
......@@ -6,7 +6,9 @@ namespace DB
{
class FunctionFactory;
#if USE_HYPERSCAN
void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory &);
#endif
void registerFunctionRepeat(FunctionFactory &);
void registerFunctionEmpty(FunctionFactory &);
void registerFunctionNotEmpty(FunctionFactory &);
......@@ -45,6 +47,9 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
void registerFunctionsString(FunctionFactory & factory)
{
#if USE_HYPERSCAN
registerFunctionHtmlOrXmlCoarseParse(factory);
#endif
registerFunctionRepeat(factory);
registerFunctionEmpty(factory);
registerFunctionNotEmpty(factory);
......
......@@ -291,6 +291,7 @@ SRCS(
hasToken.cpp
hasTokenCaseInsensitive.cpp
hostName.cpp
htmlOrXmlCoarseParse.cpp
hypot.cpp
identity.cpp
if.cpp
......
Here is CDTATA.
This is a white space test.
This is a complex test. <script type="text/javascript">Hello, world</script> world <style> hello
hello, world
hello, world
white space collapse
-- Functional test for htmlOrXmlCoarseParse.
-- Part 1: constant string arguments, one SELECT per input shape.

-- Input consisting only of a <script> element.
SELECT htmlOrXmlCoarseParse('<script>Here is script.</script>');
-- Input consisting only of a <style> element.
SELECT htmlOrXmlCoarseParse('<style>Here is style.</style>');
-- Input consisting only of a CDATA section.
-- NOTE(review): 'CDTATA' looks like a typo for 'CDATA', but the expected output
-- repeats the literal verbatim, so the two must be changed together.
SELECT htmlOrXmlCoarseParse('<![CDATA[Here is CDTATA.]]>');
-- Plain text with no markup.
SELECT htmlOrXmlCoarseParse('This is a white space test.');
-- Combined input: DOCTYPE, nested tags, CDATA sections containing <script>/<style>
-- text, a <script> element containing a CDATA section, and embedded \n escapes.
SELECT htmlOrXmlCoarseParse('This is a complex test. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><![CDATA[<script type="text/javascript">Hello, world</script> ]]><hello />world<![CDATA[ <style> ]]> hello</style>\n<script><![CDATA[</script>]]>hello</script>\n</html>');

-- Part 2: the same function applied to a String column instead of constants.
DROP TABLE IF EXISTS defaults;
CREATE TABLE defaults
(
stringColumn String
) ENGINE = Memory();
-- Four rows: generic tags around text, a <script> element with an attribute,
-- a CDATA section, and plain text.
INSERT INTO defaults values ('<common tag>hello, world<tag>'), ('<script desc=content> some content </script>'), ('<![CDATA[hello, world]]>'), ('white space collapse');
SELECT htmlOrXmlCoarseParse(stringColumn) FROM defaults;
-- Clean up so the test leaves no table behind.
DROP table defaults;
......@@ -197,6 +197,7 @@
01181_db_atomic_drop_on_cluster
01658_test_base64Encode_mysql_compatibility
01659_test_base64Decode_mysql_compatibility
01674_htm_xml_coarse_parse
01675_data_type_coroutine
01676_clickhouse_client_autocomplete
01671_aggregate_function_group_bitmap_data
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册