提交 7bf7242a 编写于 作者: A Alexey Milovidov

Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3].

上级 ff02af98
......@@ -23,7 +23,7 @@ static inline ContainerType max(const ContainerType & lhs, const ContainerType &
}
/** For a small number of keys - an array of fixed size "on the stack."
/** For a small number of keys - an array of fixed size "on the stack".
* For the average, HashSet is allocated.
* For large, HyperLogLog is allocated.
*/
......
......@@ -5,7 +5,7 @@
/** Hash functions that are better than the trivial function std::hash.
*
* Example: when aggregated by the visitor ID, the performance increase is more than 5 times.
* Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
* This is because of following reasons:
* - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
* - in typical implementation of standard library, hash function for integers is trivial and just use lower bits;
......
......@@ -695,7 +695,7 @@ public:
/** Insert the key,
* return the iterator to a position that can be used for `placement new` of value,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* You have to make `placement new` of value if you inserted a new key,
......
......@@ -212,7 +212,7 @@ public:
/** Insert the key,
* return the iterator to a position that can be used for `placement new` of value,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* You have to make `placement new` of value if you inserted a new key,
......
......@@ -9,9 +9,9 @@
*
* Usually works a little slower than a simple hash table.
* However, it has advantages in some cases:
* - if you need to measure two hash tables together, then you can easily parallelize them by buckets;
* - lag during resizes is spread, since the small hash tables will be resized separately;
* - in theory, the cache resize is local in a larger range of sizes.
* - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
* - delay during resizes is amortized, since the small hash tables will be resized separately;
* - in theory, resizes are cache-local in a larger range of sizes.
*/
template <size_t initial_size_degree = 8>
......@@ -52,7 +52,7 @@ public:
size_t hash(const Key & x) const { return Hash::operator()(x); }
/// NOTE Bad for hash tables for more than 2^32 cells.
/// NOTE Bad for hash tables with more than 2^32 cells.
/// Maps a hash value to a bucket index: shifts the hash right by (32 - BITS_FOR_BUCKET)
/// and masks the result with MAX_BUCKET.
static size_t getBucketFromHash(size_t hash_value)
{
    const size_t shifted_hash = hash_value >> (32 - BITS_FOR_BUCKET);
    return shifted_hash & MAX_BUCKET;
}
protected:
......@@ -95,7 +95,7 @@ public:
{
typename Source::const_iterator it = src.begin();
/// It is assumed that the zero key (stored separately) when iterating is first.
/// It is assumed that the zero key (stored separately) is first in iteration order.
if (it != src.end() && it.getPtr()->isZero(src))
{
insert(*it);
......@@ -221,7 +221,7 @@ public:
/** Insert the key,
* return the iterator to a position that can be used for `placement new` value,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* You have to make `placement new` values if you inserted a new key,
......
......@@ -9,7 +9,7 @@ namespace DB
{
/** For a small number of keys - an array of fixed size "on the stack."
/** For a small number of keys - an array of fixed size "on the stack".
* For large, HyperLogLog is allocated.
* See also the more practical implementation in CombinedCardinalityEstimator.h,
* where a hash table is also used for medium-sized sets.
......
......@@ -3,8 +3,8 @@
#include <Common/CounterInFile.h>
/** Lets you receive an auto-increment number, storing it in a file.
* Designed for rare calls (not designed for performance).
/** Allows getting an auto-increment number, storing it in a file.
* Intended for rare calls (not designed for performance).
*/
class Increment
{
......@@ -39,13 +39,13 @@ public:
return getBunch(0, create_if_need);
}
/** Get the next number and increase the count by `count`.
* If the `create_if_need` parameter is not set to true, then
* the file should already have a number written (if not - create the file manually with zero).
*
* To protect against race conditions between different processes, file locks are used.
* (But when the first file is created, the race condition is possible, so it's better to create the file in advance.)
*/
/** Get the next number and increase the counter by `count`.
* If the `create_if_need` parameter is not set to true, then
* the file should already have a number written (if not - create the file manually with zero).
*
* To protect against race conditions between different processes, file locks are used.
* (But when the first file is created, the race condition is possible, so it's better to create the file in advance.)
*/
UInt64 getBunch(UInt64 count, bool create_if_need = false)
{
return static_cast<UInt64>(counter.add(static_cast<Int64>(count), create_if_need) - count + 1);
......
......@@ -4,10 +4,11 @@
#include <Poco/Util/AbstractConfiguration.h>
#include <map>
namespace DB
{
/** Apply the macros from the config in the line.
/** Apply substitutions from the macros in config to the string.
*/
class Macros
{
......
......@@ -102,10 +102,10 @@ public:
};
/** The MemoryTracker object is quite difficult to drag to all places where significant amounts of memory are allocated.
* Therefore, a thread-local pointer to used MemoryTracker or nullptr is used, if it does not need to be used.
* This pointer is set when memory consumption is monitored in this thread.
* So, you just need to drag it to all the threads that handle one request.
/** The MemoryTracker object is quite difficult to pass to all places where significant amounts of memory are allocated.
* Therefore, a thread-local pointer to used MemoryTracker is set, or nullptr if MemoryTracker does not need to be used.
* This pointer is set when memory consumption is monitored in current thread.
* So, you just need to pass it to all the threads that handle one request.
*/
extern __thread MemoryTracker * current_memory_tracker;
......
......@@ -8,8 +8,9 @@
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
template <bool b>
void OptimizedRegularExpressionImpl<b>::analyze(
template <bool thread_safe>
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
const std::string & regexp,
std::string & required_substring,
bool & is_trivial,
......@@ -20,7 +21,8 @@ void OptimizedRegularExpressionImpl<b>::analyze(
* a string outside parentheses,
* in which all metacharacters are escaped,
* and also if there are no '|' outside the brackets,
* and also avoid substrings of the form `http://` or `www`.
* and also avoid substrings of the form `http://` or `www` and some others
* (this is a hack for a typical use case in Yandex.Metrica).
*/
const char * begin = regexp.data();
const char * pos = begin;
......@@ -32,9 +34,9 @@ void OptimizedRegularExpressionImpl<b>::analyze(
bool has_alternative_on_depth_0 = false;
/// Substring with a position.
typedef std::pair<std::string, size_t> Substring;
using Substring = std::pair<std::string, size_t>;
using Substrings = std::vector<Substring>;
typedef std::vector<Substring> Substrings;
Substrings trivial_substrings(1);
Substring * last_substring = &trivial_substrings.back();
......@@ -157,7 +159,7 @@ void OptimizedRegularExpressionImpl<b>::analyze(
++pos;
break;
/// Quantifiers that allow a zero number.
/// Quantifiers that allow a zero number of occurrences.
case '{':
in_curly_braces = true;
case '?': case '*':
......@@ -208,7 +210,7 @@ void OptimizedRegularExpressionImpl<b>::analyze(
{
if (((it->second == 0 && candidate_it->second != 0)
|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
/// Tuning for the domain
/// Tuning for typical usage domain
&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
......@@ -241,12 +243,12 @@ void OptimizedRegularExpressionImpl<b>::analyze(
}
template <bool b>
OptimizedRegularExpressionImpl<b>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
/// 3 options are supported
/// Only the following three options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");
......@@ -280,8 +282,8 @@ OptimizedRegularExpressionImpl<b>::OptimizedRegularExpressionImpl(const std::str
}
template <bool b>
bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size) const
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
{
if (is_trivial)
{
......@@ -309,8 +311,8 @@ bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subje
}
template <bool b>
bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, Match & match) const
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
{
if (is_trivial)
{
......@@ -357,8 +359,8 @@ bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subje
}
template <bool b>
unsigned OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
template <bool thread_safe>
unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
{
matches.clear();
......
......@@ -24,7 +24,7 @@ namespace DB
* To be more precise - for use in ColumnVector.
* It differs from std::vector in that it does not initialize the elements.
*
* Made uncopable so that there are no random copies. You can copy the data using `assign` method.
* Made noncopyable so that there are no accidental copies. You can copy the data using `assign` method.
*
* Only part of the std::vector interface is supported.
*
......@@ -40,20 +40,20 @@ template <typename T, size_t INITIAL_SIZE = 4096, typename TAllocator = Allocato
class PODArray : private boost::noncopyable, private TAllocator /// empty base optimization
{
private:
/// Round padding up to an integer number of elements to simplify arithmetic.
/// Round padding up to a whole number of elements to simplify arithmetic.
static constexpr size_t pad_right = (pad_right_ + sizeof(T) - 1) / sizeof(T) * sizeof(T);
char * c_start = nullptr;
char * c_end = nullptr;
char * c_start = nullptr;
char * c_end = nullptr;
char * c_end_of_storage = nullptr; /// Does not include pad_right.
T * t_start() { return reinterpret_cast<T *>(c_start); }
T * t_end() { return reinterpret_cast<T *>(c_end); }
T * t_end_of_storage() { return reinterpret_cast<T *>(c_end_of_storage); }
T * t_start() { return reinterpret_cast<T *>(c_start); }
T * t_end() { return reinterpret_cast<T *>(c_end); }
T * t_end_of_storage() { return reinterpret_cast<T *>(c_end_of_storage); }
const T * t_start() const { return reinterpret_cast<const T *>(c_start); }
const T * t_end() const { return reinterpret_cast<const T *>(c_end); }
const T * t_end_of_storage() const { return reinterpret_cast<const T *>(c_end_of_storage); }
const T * t_start() const { return reinterpret_cast<const T *>(c_start); }
const T * t_end() const { return reinterpret_cast<const T *>(c_end); }
const T * t_end_of_storage() const { return reinterpret_cast<const T *>(c_end_of_storage); }
/// The amount of memory occupied by the num_elements of the elements.
static size_t byte_size(size_t num_elements) { return num_elements * sizeof(T); }
......@@ -173,16 +173,16 @@ public:
const T & operator[] (size_t n) const { return t_start()[n]; }
T & front() { return t_start()[0]; }
T & back() { return t_end()[-1]; }
T & back() { return t_end()[-1]; }
const T & front() const { return t_start()[0]; }
const T & back() const { return t_end()[-1]; }
iterator begin() { return t_start(); }
iterator end() { return t_end(); }
const_iterator begin() const { return t_start(); }
const_iterator end() const { return t_end(); }
const_iterator cbegin() const { return t_start(); }
const_iterator cend() const { return t_end(); }
iterator begin() { return t_start(); }
iterator end() { return t_end(); }
const_iterator begin() const { return t_start(); }
const_iterator end() const { return t_end(); }
const_iterator cbegin() const { return t_start(); }
const_iterator cend() const { return t_end(); }
void reserve(size_t n)
{
......@@ -209,7 +209,7 @@ public:
c_end = c_start + byte_size(n);
}
/// Same as resize, but zeros new elements.
/// Same as resize, but zeroes new elements.
void resize_fill(size_t n)
{
size_t old_size = size();
......@@ -261,7 +261,7 @@ public:
c_end -= byte_size(1);
}
/// Do not insert a piece of yourself into the array. Because with the resize, the iterators on themselves can be invalidated.
/// Do not insert a piece of the array into itself, because a resize can invalidate the iterators pointing into it.
template <typename It1, typename It2>
void insert(It1 from_begin, It2 from_end)
{
......
......@@ -8,8 +8,17 @@
#include <common/logger_useful.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
}
/** A class from which you can inherit and get a pool of something. Used for database connection pools.
* The heir must provide a method for creating a new object to place in the pool.
* Descendant class must provide a method for creating a new object to place in the pool.
*/
template <typename TObject>
......@@ -63,27 +72,27 @@ public:
Entry() {} /// For deferred initialization.
/** The `Entry` object protects the resource from being used by another thread.
* The following methods are forbidden for `rvalue`, so you can not write a similar to
*
* auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed
* q.execute (); // Someone else can use this Connection
*/
* The following methods are forbidden for `rvalue`, so you cannot write something like
*
* auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed
* q.execute (); // Someone else can use this Connection
*/
Object * operator->() && = delete;
const Object * operator->() const && = delete;
Object & operator*() && = delete;
const Object & operator*() const && = delete;
Object * operator->() & { return &*data->data.object; }
const Object * operator->() const & { return &*data->data.object; }
Object & operator*() & { return *data->data.object; }
const Object & operator*() const & { return *data->data.object; }
Object * operator->() & { return &*data->data.object; }
const Object * operator->() const & { return &*data->data.object; }
Object & operator*() & { return *data->data.object; }
const Object & operator*() const & { return *data->data.object; }
bool isNull() const { return data == nullptr; }
PoolBase * getPool() const
{
if (!data)
throw DB::Exception("attempt to get pool from uninitialized entry");
throw DB::Exception("Attempt to get pool from uninitialized entry", DB::ErrorCodes::LOGICAL_ERROR);
return &data->data.pool;
}
......@@ -95,7 +104,7 @@ public:
virtual ~PoolBase() {}
/** Allocates the object for the job. With timeout < 0, the timeout is infinite. */
/** Allocates the object. Wait for free object in pool for 'timeout'. With 'timeout' < 0, the timeout is infinite. */
Entry get(Poco::Timespan::TimeDiff timeout)
{
std::unique_lock<std::mutex> lock(mutex);
......@@ -137,7 +146,7 @@ private:
/** Pool. */
Objects items;
/** Block to access the pool. */
/** Lock to access the pool. */
std::mutex mutex;
std::condition_variable available;
......@@ -151,7 +160,7 @@ protected:
items.reserve(max_items);
}
/** Creates a new object to put in the pool. */
/** Creates a new object to put into the pool. */
virtual ObjectPtr allocObject() = 0;
};
......@@ -13,10 +13,10 @@
#include <Core/Defines.h>
/** Bitwise sort, has the following functionality:
/** Radix sort, has the following functionality:
* Can sort unsigned, signed numbers, and floats.
* Can sort an array of fixed length elements that contain something else besides the key.
* Customizable digit size.
* Customizable radix size.
*
* LSB, stable.
* NOTE For some applications it makes sense to add MSB-radix-sort,
......@@ -49,7 +49,7 @@ struct RadixSortMallocAllocator
template <typename KeyBits>
struct RadixSortFloatTransform
{
/// Is it worth writing the result in memory, or is it better to do it every time again?
/// Is it worth writing the result in memory, or is it better to do calculation every time again?
static constexpr bool transform_is_simple = false;
static KeyBits forward(KeyBits x)
......@@ -74,7 +74,7 @@ struct RadixSortFloatTraits
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
using KeyBits = typename std::conditional<sizeof(Float) == 8, uint64_t, uint32_t>::type;
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, it bits, to do one pass - reshuffle of the array.
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
using Transform = RadixSortFloatTransform<KeyBits>;
......@@ -95,7 +95,7 @@ struct RadixSortIdentityTransform
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x; }
static KeyBits backward(KeyBits x) { return x; }
static KeyBits backward(KeyBits x) { return x; }
};
......@@ -105,7 +105,7 @@ struct RadixSortSignedTransform
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
};
......@@ -150,7 +150,7 @@ struct RadixSort
private:
using Element = typename Traits::Element;
using Key = typename Traits::Key;
using CountType = typename Traits::CountType;
using CountType = typename Traits::CountType;
using KeyBits = typename Traits::KeyBits;
static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
......@@ -174,9 +174,9 @@ public:
{
/// If the array is smaller than 256, then it is better to use another algorithm.
/// There are loops of NUM_PASSES. It is very important that they unfold in compile-time.
/// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.
/// For each of the NUM_PASSES bits of the key, consider how many times each value of this piece met.
/// For each of the NUM_PASSES bit ranges of the key, count how many times each value of this bit range occurs.
CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};
typename Traits::Allocator allocator;
......@@ -230,6 +230,7 @@ public:
}
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
/// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
if (NUM_PASSES % 2)
memcpy(arr, swap_buffer, size * sizeof(Element));
......
......@@ -10,8 +10,8 @@ namespace DB
/** Lets you run the command,
* read it stdout, stderr, write to stdin,
* wait for completion.
* read its stdout and stderr; write to stdin;
* wait for completion.
*
* The implementation is similar to the popen function from POSIX (see libc source code).
*
......@@ -20,8 +20,8 @@ namespace DB
* with some overcommit settings, if the address space of the process is more than half the amount of available memory.
* Also, changing memory maps - a fairly resource-intensive operation.
*
* The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr running process,
* and also find out the code and the completion status.
* The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr of running process,
* and also to obtain the return code and completion status.
*/
class ShellCommand
{
......
......@@ -7,9 +7,9 @@
/** The simplest cache for a free function.
* You can also pass a static class method or lambda without capturing.
* The size is unlimited. Values are not obsolete.
* To synchronize, use mutex.
* You can also pass a static class method or lambda without captures.
* The size is unlimited. Values are stored permanently and never evicted.
* Mutex is used for synchronization.
* Suitable only for the simplest cases.
*
* Usage
......
......@@ -3,17 +3,17 @@
/** SipHash is a fast cryptographic hash function for short strings.
* Taken from here: https://www.131002.net/siphash/
*
* This is SipHash 2-4 variant.
*
* Two changes are made:
* - returns 128 bits, not 64;
* - returns also 128 bits, not only 64;
* - done streaming (can be calculated in parts).
*
* On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL.
* (~ 700 MB/sec, 15 million strings per second)
*/
#include <cstdint>
#include <cstddef>
#include <Core/Types.h>
#include <common/Types.h>
#define ROTL(x,b) static_cast<u64>( ((x) << (b)) | ( (x) >> (64 - (b))) )
......@@ -30,23 +30,20 @@
class SipHash
{
private:
using u64 = DB::UInt64;
using u8 = DB::UInt8;
/// Status.
u64 v0;
u64 v1;
u64 v2;
u64 v3;
/// State.
UInt64 v0;
UInt64 v1;
UInt64 v2;
UInt64 v3;
/// How many bytes have been processed.
u64 cnt;
UInt64 cnt;
/// The current 8 bytes of input data.
union
{
u64 current_word;
u8 current_bytes[8];
UInt64 current_word;
UInt8 current_bytes[8];
};
void finalize()
......@@ -68,7 +65,7 @@ private:
public:
/// Arguments - seed.
SipHash(u64 k0 = 0, u64 k1 = 0)
SipHash(UInt64 k0 = 0, UInt64 k1 = 0)
{
/// Initialize the state with some random bytes and seed.
v0 = 0x736f6d6570736575ULL ^ k0;
......@@ -80,7 +77,7 @@ public:
current_word = 0;
}
void update(const char * data, u64 size)
void update(const char * data, UInt64 size)
{
const char * end = data + size;
......@@ -94,7 +91,7 @@ public:
++cnt;
}
/// If you still do not have enough bytes to an 8-byte word.
/// If we still do not have enough bytes for an 8-byte word.
if (cnt & 7)
return;
......@@ -108,7 +105,7 @@ public:
while (data + 8 <= end)
{
current_word = *reinterpret_cast<const u64 *>(data);
current_word = *reinterpret_cast<const UInt64 *>(data);
v3 ^= current_word;
SIPROUND;
......@@ -138,18 +135,18 @@ public:
void get128(char * out)
{
finalize();
reinterpret_cast<u64 *>(out)[0] = v0 ^ v1;
reinterpret_cast<u64 *>(out)[1] = v2 ^ v3;
reinterpret_cast<UInt64 *>(out)[0] = v0 ^ v1;
reinterpret_cast<UInt64 *>(out)[1] = v2 ^ v3;
}
void get128(u64 & lo, u64 & hi)
void get128(UInt64 & lo, UInt64 & hi)
{
finalize();
lo = v0 ^ v1;
hi = v2 ^ v3;
}
u64 get64()
UInt64 get64()
{
finalize();
return v0 ^ v1 ^ v2 ^ v3;
......@@ -160,6 +157,7 @@ public:
#undef ROTL
#undef SIPROUND
#include <cstddef>
inline void sipHash128(const char * data, const size_t size, char * out)
{
......@@ -168,7 +166,7 @@ inline void sipHash128(const char * data, const size_t size, char * out)
hash.get128(out);
}
inline DB::UInt64 sipHash64(const char * data, const size_t size)
inline UInt64 sipHash64(const char * data, const size_t size)
{
SipHash hash;
hash.update(data, size);
......@@ -177,7 +175,7 @@ inline DB::UInt64 sipHash64(const char * data, const size_t size)
#include <string>
inline DB::UInt64 sipHash64(const std::string & s)
/// Convenience overload: hashes the bytes of a std::string
/// by forwarding its data pointer and size to the (pointer, size) overload.
inline UInt64 sipHash64(const std::string & s)
{
    const char * const bytes = s.data();
    const size_t length = s.size();
    return sipHash64(bytes, length);
}
......@@ -19,15 +19,14 @@
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_PARAMETER;
}
/** Variants for finding a substring in a string.
* In most cases, less productive than Volnitsky (see Volnitsky.h).
/** Variants for searching a substring in a string.
* In most cases, performance is less than Volnitsky (see Volnitsky.h).
*/
......@@ -37,7 +36,7 @@ struct StringSearcherBase
static constexpr auto n = sizeof(__m128i);
const int page_size = getpagesize();
bool page_safe(const void * const ptr) const
/// Checks whether `n` bytes starting at `ptr` can be read without crossing past the end of the memory page.
bool pageSafe(const void * const ptr) const
{
    /// Offset of `ptr` inside its page (the mask assumes page_size is a power of two — TODO confirm).
    const auto offset_in_page = (page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr);
    return offset_in_page <= page_size - n;
}
......@@ -55,7 +54,7 @@ class StringSearcher<false, false> : private StringSearcherBase
private:
using UTF8SequenceBuffer = UInt8[6];
/// string to be searched for
/// substring to be searched for
const UInt8 * const needle;
const std::size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
......@@ -135,8 +134,7 @@ public:
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
throw DB::Exception{
"UTF8 sequences with different lowercase and uppercase lengths are not supported",
DB::ErrorCodes::UNSUPPORTED_PARAMETER
};
DB::ErrorCodes::UNSUPPORTED_PARAMETER};
cache_actual_len += src_len;
if (cache_actual_len < n)
......@@ -165,7 +163,7 @@ public:
static const Poco::UTF8Encoding utf8;
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
......@@ -230,7 +228,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
......@@ -249,7 +247,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
......@@ -377,7 +375,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
......@@ -429,7 +427,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
......@@ -447,7 +445,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
......@@ -559,7 +557,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
......@@ -609,7 +607,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
/// find first character
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
......@@ -627,7 +625,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
/// check for first 16 octets
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
......@@ -694,9 +692,9 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
/** Uses functions from libc.
* It makes sense to use short strings when cheap initialization is required.
* There is no option for register-independent search for UTF-8 strings.
* It is required that the end of the lines be zero byte.
* It makes sense to use only with short haystacks when cheap initialization is required.
* There is no option for case-insensitive search for UTF-8 strings.
* It is required that strings are zero-terminated.
*/
struct LibCASCIICaseSensitiveStringSearcher
......
#pragma once
#include <time.h> /// nanosleep
#include <mutex>
#include <memory>
#include <Common/Stopwatch.h>
#include <Common/Exception.h>
#include <IO/WriteHelpers.h>
namespace DB
{
......@@ -15,12 +17,12 @@ namespace ErrorCodes
}
/** Allows you to limit the speed of something (in pieces per second) using sleep.
/** Allows you to limit the speed of something (in entities per second) using sleep.
* Specifics of work:
* - only the average speed is considered, from the moment of the first call of `add` function;
* if there were periods with low speed, then during some time after them, the speed will be higher;
*
* Also allows you to set a limit on the maximum number of pieces. If you exceed, an exception is thrown.
* Also allows you to set a limit on the maximum number of entities. If exceeded, an exception will be thrown.
*/
class Throttler
{
......@@ -56,7 +58,7 @@ public:
if (max_speed)
{
/// How much time would have gone for the speed to become `max_speed`.
/// How much time to wait for the average speed to become `max_speed`.
UInt64 desired_ns = new_count * 1000000000 / max_speed;
if (desired_ns > elapsed_ns)
......@@ -65,7 +67,7 @@ public:
timespec sleep_ts;
sleep_ts.tv_sec = sleep_ns / 1000000000;
sleep_ts.tv_nsec = sleep_ns % 1000000000;
nanosleep(&sleep_ts, nullptr); /// NOTE Ends early in case of a signal. This is considered normal.
nanosleep(&sleep_ts, nullptr); /// NOTE Returns early in case of a signal. This is considered normal.
}
}
}
......
......@@ -16,10 +16,10 @@ class Context;
namespace VirtualColumnUtils
{
/// Calculate the minimum numeric suffix to add to the row so that it is not present in the set
/// Calculate the minimum numeric suffix to add to the string so that it is not present in the set
String chooseSuffix(const NamesAndTypesList & columns, const String & name);
/// Calculate the minimum total numeric suffix to add to each row,
/// Calculate the minimum total numeric suffix to add to each string,
/// so that none is present in the set.
String chooseSuffixForSet(const NamesAndTypesList & columns, const std::vector<String> & names);
......
......@@ -2,6 +2,7 @@
#include <Common/StringSearcher.h>
#include <Common/StringUtils.h>
#include <Core/Types.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
#include <ext/range.hpp>
......@@ -12,7 +13,7 @@
/** Search for a substring in a string by Volnitsky's algorithm
* http://volnitsky.com/project/str_search/
*
* `haystack` and `needle` can contain null bytes.
* `haystack` and `needle` can contain zero bytes.
*
* Algorithm:
* - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr;
......@@ -23,7 +24,7 @@
* - bigrams can be inserted several times if they occur in the needle several times;
* - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end);
* - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise;
* - if it did not work, we check the next cell of the hash table from the collision resolution chain;
* - if it did not match, we check the next cell of the hash table from the collision resolution chain;
* - if not found, skip to haystack almost the size of the needle bytes;
*
* Unaligned memory access is used.
......@@ -39,34 +40,35 @@ template <typename CRTP>
class VolnitskyBase
{
protected:
using offset_t = uint8_t; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using ngram_t = uint16_t; /// n-gram (2 bytes).
using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using Ngram = UInt16; /// n-gram (2 bytes).
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
/// How far we move forward if the n-gram from haystack is not found in the hash table.
const size_t step = needle_size - sizeof(ngram_t) + 1;
const size_t step = needle_size - sizeof(Ngram) + 1;
/** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache.
offset_t hash[hash_size]; /// Hash table.
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs).
Offset hash[hash_size]; /// Hash table.
/// min haystack size to use main algorithm instead of fallback
static constexpr auto min_haystack_size_for_algorithm = 20000;
const bool fallback; /// Do I need to use the fallback algorithm.
const bool fallback; /// Do we need to use the fallback algorithm.
public:
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Can not specify.
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered that it's useless to waste time initializing the hash table.
*/
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
: needle{reinterpret_cast<const UInt8 *>(needle)}, needle_size{needle_size},
fallback{
needle_size < 2 * sizeof(ngram_t) || needle_size >= std::numeric_limits<offset_t>::max() ||
(haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
needle_size < 2 * sizeof(Ngram)
|| needle_size >= std::numeric_limits<Offset>::max()
|| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
{
if (fallback)
return;
......@@ -74,7 +76,7 @@ public:
memset(hash, 0, sizeof(hash));
/// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
for (auto i = static_cast<int>(needle_size - sizeof(ngram_t)); i >= 0; --i)
for (auto i = static_cast<int>(needle_size - sizeof(Ngram)); i >= 0; --i)
self().putNGram(this->needle + i, i + 1, this->needle);
}
......@@ -91,7 +93,7 @@ public:
return self().search_fallback(haystack, haystack_end);
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
const auto * pos = haystack + needle_size - sizeof(ngram_t);
const auto * pos = haystack + needle_size - sizeof(Ngram);
for (; pos <= haystack_end - needle_size; pos += step)
{
/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
......@@ -119,12 +121,12 @@ protected:
CRTP & self() { return static_cast<CRTP &>(*this); }
const CRTP & self() const { return const_cast<VolnitskyBase *>(this)->self(); }
static const ngram_t & toNGram(const UInt8 * const pos)
static const Ngram & toNGram(const UInt8 * const pos)
{
return *reinterpret_cast<const ngram_t *>(pos);
return *reinterpret_cast<const Ngram *>(pos);
}
void putNGramBase(const ngram_t ngram, const int offset)
void putNGramBase(const Ngram ngram, const int offset)
{
/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
size_t cell_num = ngram % hash_size;
......@@ -145,7 +147,7 @@ protected:
union
{
ngram_t n;
Ngram n;
Chars chars;
};
......@@ -260,7 +262,7 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
union
{
ngram_t n;
Ngram n;
Chars chars;
};
......@@ -277,10 +279,12 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
* or intersect with two code points.
*
* In the first case, you need to consider up to two alternatives - this code point in upper and lower case,
* and in the second case - up to four alternatives - fragments of two code points in all combinations of registers.
* and in the second case - up to four alternatives - fragments of two code points in all combinations of cases.
*
* It does not take into account the dependence of the transformation between the registers from the locale (for example - Turkish `Ii`)
* It does not take into account the dependence of the case transformation on the locale (for example - Turkish `Ii`)
* as well as composition / decomposition and other features.
*
* It also does not work if characters with lower and upper cases are represented by different number of bytes or code points.
*/
using Seq = UInt8[6];
......@@ -302,12 +306,12 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
putNGramBase(n, offset);
else
{
/// where is the given ngram in respect to UTF-8 sequence start?
/// where is the given ngram with respect to the start of the UTF-8 sequence?
const auto seq_ngram_offset = pos - seq_pos;
Seq seq;
/// put ngram from lowercase
/// put ngram for lowercase
utf8.convert(l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
......@@ -326,7 +330,7 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
/// first sequence may start before u_pos if it is not ASCII
auto first_seq_pos = pos;
UTF8::syncBackward(first_seq_pos, begin);
/// where is the given ngram in respect to the first UTF-8 sequence start?
/// where is the given ngram with respect to the start of the first UTF-8 sequence?
const auto seq_ngram_offset = pos - first_seq_pos;
const auto first_u32 = utf8.convert(first_seq_pos);
......
......@@ -4,11 +4,11 @@
#include <IO/WriteBuffer.h>
/// Displays the transmitted size in bytes as 123.45 GiB.
/// Displays the passed size in bytes as 123.45 GiB.
void formatReadableSizeWithBinarySuffix(double value, DB::WriteBuffer & out, int precision = 2);
std::string formatReadableSizeWithBinarySuffix(double value, int precision = 2);
/// Displays the transmitted size in bytes as 132.55 GB.
/// Displays the passed size in bytes as 132.55 GB.
void formatReadableSizeWithDecimalSuffix(double value, DB::WriteBuffer & out, int precision = 2);
std::string formatReadableSizeWithDecimalSuffix(double value, int precision = 2);
......
......@@ -2,7 +2,7 @@
#include <string>
/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the hostname utility with the -f flag.
* If it does not work, return hostname - similar to calling hostname without flags or uname -n.
/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the 'hostname' tool with the -f flag.
* If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'.
*/
const std::string & getFQDNOrHostName();
......@@ -16,7 +16,7 @@ namespace DB
}
/** Checks match of type by comparing typeid.
/** Checks type by comparing typeid.
* The exact match of the type is checked. That is, a cast to an ancestor type will be unsuccessful.
* Otherwise, behaves like a dynamic_cast.
*/
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册