diff --git a/dbms/src/Common/formatIPv6.cpp b/dbms/src/Common/formatIPv6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54617b6aa4d0080200d18ca10ff4ebc568df9899 --- /dev/null +++ b/dbms/src/Common/formatIPv6.cpp @@ -0,0 +1,125 @@ +#include +#include +#include +#include + + +namespace DB +{ + +/// integer logarithm, return ceil(log(value, base)) (the smallest integer greater or equal than log(value, base) +static constexpr uint32_t int_log(const uint32_t value, const uint32_t base, const bool carry = false) +{ + return value >= base ? 1 + int_log(value / base, base, value % base || carry) : value % base > 1 || carry; +} + +/// print integer in desired base, faster than sprintf +template +static void print_integer(char *& out, T value) +{ + if (value == 0) + *out++ = '0'; + else + { + char buf[buffer_size]; + auto ptr = buf; + + while (value > 0) + { + *ptr++ = hexLowercase(value % base); + value /= base; + } + + while (ptr != buf) + *out++ = *--ptr; + } +} + +/// print IPv4 address as %u.%u.%u.%u +static void formatIPv4(const unsigned char * src, char *& dst, UInt8 zeroed_tail_bytes_count) +{ + const auto limit = IPV4_BINARY_LENGTH - zeroed_tail_bytes_count; + + for (const auto i : ext::range(0, IPV4_BINARY_LENGTH)) + { + UInt8 byte = (i < limit) ? src[i] : 0; + print_integer<10, UInt8>(dst, byte); + + if (i != IPV4_BINARY_LENGTH - 1) + *dst++ = '.'; + } +} + + +void formatIPv6(const unsigned char * src, char *& dst, UInt8 zeroed_tail_bytes_count) +{ + struct { int base, len; } best{-1}, cur{-1}; + std::array words{}; + + /** Preprocess: + * Copy the input (bytewise) array into a wordwise array. + * Find the longest run of 0x00's in src[] for :: shorthanding. */ + for (const auto i : ext::range(0, IPV6_BINARY_LENGTH - zeroed_tail_bytes_count)) + words[i / 2] |= src[i] << ((1 - (i % 2)) << 3); + + for (const auto i : ext::range(0, words.size())) + { + if (words[i] == 0) { + if (cur.base == -1) + cur.base = i, cur.len = 1; + else + cur.len++; + } + else + { + if (cur.base != -1) + { + if (best.base == -1 || cur.len > best.len) + best = cur; + cur.base = -1; + } + } + } + + if (cur.base != -1) + { + if (best.base == -1 || cur.len > best.len) + best = cur; + } + + if (best.base != -1 && best.len < 2) + best.base = -1; + + /// Format the result. + for (const int i : ext::range(0, words.size())) + { + /// Are we inside the best run of 0x00's? + if (best.base != -1 && i >= best.base && i < (best.base + best.len)) + { + if (i == best.base) + *dst++ = ':'; + continue; + } + + /// Are we following an initial run of 0x00s or any real hex? + if (i != 0) + *dst++ = ':'; + + /// Is this address an encapsulated IPv4? + if (i == 6 && best.base == 0 && (best.len == 6 || (best.len == 5 && words[5] == 0xffffu))) + { + formatIPv4(src + 12, dst, std::min(zeroed_tail_bytes_count, static_cast(IPV4_BINARY_LENGTH))); + break; + } + + print_integer<16>(dst, words[i]); + } + + /// Was it a trailing run of 0x00's? + if (best.base != -1 && (best.base + best.len) == words.size()) + *dst++ = ':'; + + *dst++ = '\0'; +} + +} diff --git a/dbms/src/Common/formatIPv6.h b/dbms/src/Common/formatIPv6.h new file mode 100644 index 0000000000000000000000000000000000000000..6baf5544f02a81d4062c869570ff54c589ba85fb --- /dev/null +++ b/dbms/src/Common/formatIPv6.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#define IPV4_BINARY_LENGTH 4 +#define IPV6_BINARY_LENGTH 16 +#define IPV4_MAX_TEXT_LENGTH 15 /// Does not count tail zero byte. +#define IPV6_MAX_TEXT_LENGTH 39 + + +namespace DB +{ + + +/** Rewritten inet_ntop6 from http://svn.apache.org/repos/asf/apr/apr/trunk/network_io/unix/inet_pton.c + * performs significantly faster than the reference implementation due to the absence of sprintf calls, + * bounds checking, unnecessary string copying and length calculation. + */ +void formatIPv6(const unsigned char * src, char *& dst, UInt8 zeroed_tail_bytes_count = 0); + +} diff --git a/dbms/src/Dictionaries/TrieDictionary.cpp b/dbms/src/Dictionaries/TrieDictionary.cpp index c63a88f32da3d6dfa211496a4531ed91a20b33d8..1fd2dc4172871fc50f59fbd097c04192c81a5859 100644 --- a/dbms/src/Dictionaries/TrieDictionary.cpp +++ b/dbms/src/Dictionaries/TrieDictionary.cpp @@ -8,11 +8,11 @@ #include #include #include +#include #include +#include #include -#include - namespace DB { @@ -595,7 +595,7 @@ void TrieDictionary::trieTraverse(const btrie_t * tree, Getter && getter) const Columns TrieDictionary::getKeyColumns() const { - auto ip_column = std::make_shared(ipv6_bytes_length); + auto ip_column = std::make_shared(IPV6_BINARY_LENGTH); auto mask_column = std::make_shared>(); auto getter = [& ip_column, & mask_column](__uint128_t ip, size_t mask) { @@ -603,7 +603,7 @@ Columns TrieDictionary::getKeyColumns() const ip_array[0] = Poco::ByteOrder::fromNetwork(ip_array[0]); ip_array[1] = Poco::ByteOrder::fromNetwork(ip_array[1]); std::swap(ip_array[0], ip_array[1]); - ip_column->insertData(reinterpret_cast(ip_array), ipv6_bytes_length); + ip_column->insertData(reinterpret_cast(ip_array), IPV6_BINARY_LENGTH); mask_column->insert(static_cast(mask)); }; @@ -619,7 +619,7 @@ BlockInputStreamPtr TrieDictionary::getBlockInputStream(const Names & column_nam { const auto & attr = attributes.front(); return ColumnsWithTypeAndName({ColumnWithTypeAndName(columns.front(), - std::make_shared(ipv6_bytes_length), attr.name)}); + std::make_shared(IPV6_BINARY_LENGTH), attr.name)}); }; auto getView = [](const Columns& columns, const std::vector& attributes) { @@ -631,7 +631,7 @@ BlockInputStreamPtr TrieDictionary::getBlockInputStream(const Names & column_nam { UInt8 mask = mask_column->getElement(row); char * ptr = buffer; - IPv6Format::apply(reinterpret_cast(ip_column->getDataAt(row).data), ptr); + formatIPv6(reinterpret_cast(ip_column->getDataAt(row).data), ptr); *(ptr - 1) = '/'; auto size = detail::writeUIntText(mask, ptr); column->insertData(buffer, size + (ptr - buffer)); diff --git a/dbms/src/Functions/FunctionsCoding.h b/dbms/src/Functions/FunctionsCoding.h index da1d28b7fe8ee9dd97a7afd12dc76d852b6f7745..a257e2e7323371a17ba795488a5e42cf43ec7fa2 100644 --- a/dbms/src/Functions/FunctionsCoding.h +++ b/dbms/src/Functions/FunctionsCoding.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include -#include #include #include @@ -51,129 +51,6 @@ constexpr auto ipv6_bytes_length = 16; constexpr auto uuid_bytes_length = 16; constexpr auto uuid_text_length = 36; -class IPv6Format -{ -private: - /// integer logarithm, return ceil(log(value, base)) (the smallest integer greater or equal than log(value, base) - static constexpr uint32_t int_log(const uint32_t value, const uint32_t base, const bool carry = false) - { - return value >= base ? 1 + int_log(value / base, base, value % base || carry) : value % base > 1 || carry; - } - - /// print integer in desired base, faster than sprintf - template - static void print_integer(char *& out, T value) - { - if (value == 0) - *out++ = '0'; - else - { - char buf[buffer_size]; - auto ptr = buf; - - while (value > 0) - { - *ptr++ = hexLowercase(value % base); - value /= base; - } - - while (ptr != buf) - *out++ = *--ptr; - } - } - - /// print IPv4 address as %u.%u.%u.%u - static void ipv4_format(const unsigned char * src, char *& dst, UInt8 zeroed_tail_bytes_count) - { - const auto limit = ipv4_bytes_length - zeroed_tail_bytes_count; - - for (const auto i : ext::range(0, ipv4_bytes_length)) - { - UInt8 byte = (i < limit) ? src[i] : 0; - print_integer<10, UInt8>(dst, byte); - - if (i != ipv4_bytes_length - 1) - *dst++ = '.'; - } - } - -public: - /** rewritten inet_ntop6 from http://svn.apache.org/repos/asf/apr/apr/trunk/network_io/unix/inet_pton.c - * performs significantly faster than the reference implementation due to the absence of sprintf calls, - * bounds checking, unnecessary string copying and length calculation - */ - static const void apply(const unsigned char * src, char *& dst, UInt8 zeroed_tail_bytes_count = 0) - { - struct { int base, len; } best{-1}, cur{-1}; - std::array words{}; - - /** Preprocess: - * Copy the input (bytewise) array into a wordwise array. - * Find the longest run of 0x00's in src[] for :: shorthanding. */ - for (const auto i : ext::range(0, ipv6_bytes_length - zeroed_tail_bytes_count)) - words[i / 2] |= src[i] << ((1 - (i % 2)) << 3); - - for (const auto i : ext::range(0, words.size())) - { - if (words[i] == 0) { - if (cur.base == -1) - cur.base = i, cur.len = 1; - else - cur.len++; - } - else - { - if (cur.base != -1) - { - if (best.base == -1 || cur.len > best.len) - best = cur; - cur.base = -1; - } - } - } - - if (cur.base != -1) - { - if (best.base == -1 || cur.len > best.len) - best = cur; - } - - if (best.base != -1 && best.len < 2) - best.base = -1; - - /// Format the result. - for (const int i : ext::range(0, words.size())) - { - /// Are we inside the best run of 0x00's? - if (best.base != -1 && i >= best.base && i < (best.base + best.len)) - { - if (i == best.base) - *dst++ = ':'; - continue; - } - - /// Are we following an initial run of 0x00s or any real hex? - if (i != 0) - *dst++ = ':'; - - /// Is this address an encapsulated IPv4? - if (i == 6 && best.base == 0 && (best.len == 6 || (best.len == 5 && words[5] == 0xffffu))) - { - ipv4_format(src + 12, dst, std::min(zeroed_tail_bytes_count, static_cast(ipv4_bytes_length))); - break; - } - - print_integer<16>(dst, words[i]); - } - - /// Was it a trailing run of 0x00's? - if (best.base != -1 && (best.base + best.len) == words.size()) - *dst++ = ':'; - - *dst++ = '\0'; - } -}; - class FunctionIPv6NumToString : public IFunction { @@ -220,7 +97,7 @@ public: ColumnString::Chars_t & vec_res = col_res->getChars(); ColumnString::Offsets_t & offsets_res = col_res->getOffsets(); - vec_res.resize(size * INET6_ADDRSTRLEN); + vec_res.resize(size * (IPV6_MAX_TEXT_LENGTH + 1)); offsets_res.resize(size); auto begin = reinterpret_cast(&vec_res[0]); @@ -228,7 +105,7 @@ public: for (size_t offset = 0, i = 0; offset < vec_in.size(); offset += ipv6_bytes_length, ++i) { - IPv6Format::apply(&vec_in[offset], pos); + formatIPv6(&vec_in[offset], pos); offsets_res[i] = pos - begin; } @@ -246,9 +123,9 @@ public: const auto & data_in = col_in->getData(); - char buf[INET6_ADDRSTRLEN]; + char buf[IPV6_MAX_TEXT_LENGTH + 1]; char * dst = buf; - IPv6Format::apply(reinterpret_cast(data_in.data()), dst); + formatIPv6(reinterpret_cast(data_in.data()), dst); block.safeGetByPosition(result).column = std::make_shared(col_in->size(), buf); } @@ -343,7 +220,7 @@ public: ColumnString::Chars_t & vec_res = col_res->getChars(); ColumnString::Offsets_t & offsets_res = col_res->getOffsets(); - vec_res.resize(size * INET6_ADDRSTRLEN); + vec_res.resize(size * (IPV6_MAX_TEXT_LENGTH + 1)); offsets_res.resize(size); auto begin = reinterpret_cast(&vec_res[0]); @@ -395,7 +272,7 @@ public: const auto & data_in = col_in->getData(); - char buf[INET6_ADDRSTRLEN]; + char buf[IPV6_MAX_TEXT_LENGTH + 1]; char * dst = buf; const auto address = reinterpret_cast(data_in.data()); @@ -419,7 +296,7 @@ private: void cutAddress(const unsigned char * address, char *& dst, UInt8 zeroed_tail_bytes_count) { - IPv6Format::apply(address, dst, zeroed_tail_bytes_count); + formatIPv6(address, dst, zeroed_tail_bytes_count); } }; @@ -709,7 +586,7 @@ public: ColumnString::Chars_t & vec_res = col_res->getChars(); ColumnString::Offsets_t & offsets_res = col_res->getOffsets(); - vec_res.resize(vec_in.size() * INET_ADDRSTRLEN); /// the longest value is: 255.255.255.255\0 + vec_res.resize(vec_in.size() * (IPV4_MAX_TEXT_LENGTH + 1)); /// the longest value is: 255.255.255.255\0 offsets_res.resize(vec_in.size()); char * begin = reinterpret_cast(&vec_res[0]); char * pos = begin; @@ -892,7 +769,7 @@ public: ColumnString::Chars_t & vec_res = col_res->getChars(); ColumnString::Offsets_t & offsets_res = col_res->getOffsets(); - vec_res.resize(vec_in.size() * INET_ADDRSTRLEN); /// the longest value is: 255.255.255.255\0 + vec_res.resize(vec_in.size() * (IPV4_MAX_TEXT_LENGTH + 1)); /// the longest value is: 255.255.255.255\0 offsets_res.resize(vec_in.size()); char * begin = reinterpret_cast(&vec_res[0]); char * pos = begin;