提交 d42b5045 编写于 作者: A Alexey Milovidov

Merging randomPrintableASCII #8401

上级 4faf2f54
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnConst.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteBufferFromVector.h>
#include <IO/WriteHelpers.h>
#include <Common/formatReadable.h>
#include <Common/typeid_cast.h>
#include <type_traits>
#include <random>
#include <iostream>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
class FunctionRandomASCII : public IFunction
{
public:
static constexpr auto name = "randomASCII";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomASCII>(); }
String getName() const override
{
return name;
}
size_t getNumberOfArguments() const override { return 1; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const IDataType & type = *arguments[0];
if (!isNativeNumber(type))
throw Exception("Cannot format " + type.getName() + " as size in bytes", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
if (!(executeType<UInt8>(block, arguments, result, input_rows_count)
|| executeType<UInt16>(block, arguments, result, input_rows_count)
|| executeType<UInt32>(block, arguments, result, input_rows_count)
|| executeType<UInt64>(block, arguments, result, input_rows_count)
|| executeType<Int8>(block, arguments, result, input_rows_count)
|| executeType<Int16>(block, arguments, result, input_rows_count)
|| executeType<Int32>(block, arguments, result, input_rows_count)
|| executeType<Int64>(block, arguments, result, input_rows_count)))
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
private:
template <typename T>
bool executeType(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count)
{
bool is_const_column = false;
const ColumnVector<T> * col_from = checkAndGetColumn<ColumnVector<T>>(block.getByPosition(arguments[0]).column.get());
if (!col_from){
col_from = checkAndGetColumnConstData<ColumnVector<T>>(block.getByPosition(arguments[0]).column.get());
is_const_column = true;
}
if (col_from){
auto col_to = ColumnString::create();
const typename ColumnVector<T>::Container & vec_from = col_from->getData();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
offsets_to.resize(input_rows_count);
WriteBufferFromVector<ColumnString::Chars> buf_to(data_to);
std::default_random_engine generator;
std::uniform_int_distribution<int> distribution(32, 127); //Printable ASCII symbols
std::random_device rd;
char character;
size_t str_length = 0;
if (is_const_column){
str_length = static_cast<size_t>(vec_from[0]);
}
for (size_t i = 0; i < input_rows_count; ++i)
{
if (!is_const_column){
str_length = static_cast<size_t>(vec_from[i]);
}
generator.seed( rd() );
if (str_length > 0){
for (size_t j = 0; j < str_length; ++j)
{
character = distribution(generator);
writeChar(character, buf_to);
}
}
writeChar(0, buf_to);
offsets_to[i] = buf_to.count();
}
buf_to.finish();
block.getByPosition(result).column = std::move(col_to);
return true;
}
return false;
}
};
void registerFunctionRandomASCII(FunctionFactory & factory)
{
factory.registerFunction<FunctionRandomASCII>();
}
}
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <Common/thread_local_rng.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
/** Generate random string of specified length with printable ASCII characters, almost uniformly distributed.
* First argument is length, other optional arguments are ignored and used to prevent common subexpression elimination to get different values.
*/
class FunctionRandomPrintableASCII : public IFunction
{
public:
static constexpr auto name = "randomPrintableASCII";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionRandomPrintableASCII>(); }
String getName() const override
{
return name;
}
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() < 1)
throw Exception("Function " + getName() + " requires at least one argument: the size of resulting string", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
const IDataType & length_type = *arguments[0];
if (!isNumber(length_type))
throw Exception("First argument of function " + getName() + " must have numeric type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
auto col_to = ColumnString::create();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
offsets_to.resize(input_rows_count);
const IColumn & length_column = *block.getByPosition(arguments[0]).column;
IColumn::Offset offset = 0;
for (size_t row_num = 0; row_num < input_rows_count; ++row_num)
{
size_t length = length_column.getUInt(row_num);
IColumn::Offset next_offset = offset + length + 1;
data_to.resize(next_offset);
offsets_to[row_num] = next_offset;
for (size_t pos = offset, end = offset + length; pos < end; pos += 4) /// We have padding in column buffers that we can overwrite.
{
UInt64 rand = thread_local_rng();
UInt16 rand1 = rand;
UInt16 rand2 = rand >> 16;
UInt16 rand3 = rand >> 32;
UInt16 rand4 = rand >> 48;
/// Printable characters are from range [32; 126].
/// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
data_to[pos + 0] = 32 + ((rand1 * 95) >> 16);
data_to[pos + 1] = 32 + ((rand2 * 95) >> 16);
data_to[pos + 2] = 32 + ((rand3 * 95) >> 16);
data_to[pos + 3] = 32 + ((rand4 * 95) >> 16);
/// TODO Implement SIMD optimizations from Danila Kutenin.
}
data_to[offset + length] = 0;
offset = next_offset;
}
block.getByPosition(result).column = std::move(col_to);
}
};
void registerFunctionRandomPrintableASCII(FunctionFactory & factory)
{
factory.registerFunction<FunctionRandomPrintableASCII>();
}
}
......@@ -54,7 +54,7 @@ void registerFunctionEvalMLMethod(FunctionFactory &);
void registerFunctionBasename(FunctionFactory &);
void registerFunctionTransform(FunctionFactory &);
void registerFunctionGetMacro(FunctionFactory &);
void registerFunctionRandomASCII(FunctionFactory &);
void registerFunctionRandomPrintableASCII(FunctionFactory &);
void registerFunctionGetScalar(FunctionFactory &);
#if USE_ICU
......
......@@ -54,7 +54,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionBasename(factory);
registerFunctionTransform(factory);
registerFunctionGetMacro(factory);
registerFunctionRandomASCII(factory);
registerFunctionRandomPrintableASCII(factory);
registerFunctionGetScalar(factory);
#if USE_ICU
......
<test>
<!-- Performance test for the randomPrintableASCII function: throughput of string generation for several lengths. -->
<type>once</type>
<!-- Judging by the element names, a run stops as soon as any one condition holds:
     the average speed has not changed for 4000 ms, or 10000 ms have elapsed in total. TODO confirm against the test runner. -->
<stop_conditions>
<any_of>
<average_speed_not_changing_for_ms>4000</average_speed_not_changing_for_ms>
<total_time_ms>10000</total_time_ms>
</any_of>
</stop_conditions>
<!-- Metrics listed for this test: maximum and average rows and bytes per second. -->
<main_metric>
<max_rows_per_second />
<max_bytes_per_second />
<avg_rows_per_second />
<avg_bytes_per_second />
</main_metric>
<!-- Constant length argument: 10, 100, 1000 and 10000 characters per row.
     The call is wrapped in NOT ignore(...) so that it appears in the WHERE condition of every row;
     presumably ignore() always yields 0, so no row is filtered out (verify in the function reference). -->
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(10))</query>
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(100))</query>
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(1000))</query>
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(10000))</query>
<!-- Variable length argument: a different length for each row, below 10, 100 and 1000 respectively. -->
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 10))</query>
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 100))</query>
<query>SELECT count() FROM system.numbers WHERE NOT ignore(randomPrintableASCII(rand() % 1000))</query>
</test>
-- For each function, request a 1000-character string and output the type name of the result and its length.
-- Only these two properties are queried because the generated content itself is random.
SELECT toTypeName(randomASCII(1000));
SELECT length(randomASCII(1000));
SELECT toTypeName(randomPrintableASCII(1000));
SELECT length(randomPrintableASCII(1000));
......@@ -6,7 +6,7 @@ Returns a string with the name of the host that this function was performed on.
## FQDN {#fqdn}
Returns the fully qualified domain name.
Returns the fully qualified domain name.
**Syntax**
......@@ -392,7 +392,7 @@ neighbor(column, offset[, default_value])
The result of the function depends on the affected data blocks and the order of data in the block.
If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
**Parameters**
**Parameters**
- `column` — A column name or scalar expression.
- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../data_types/int_uint.md).
......@@ -400,7 +400,7 @@ If you make a subquery with ORDER BY and call the function from outside the subq
**Returned values**
- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds.
- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds.
- Default value for `column` if `offset` value is outside block bounds. If `default_value` is given, then it will be used.
Type: type of data blocks affected or default value type.
......@@ -545,7 +545,7 @@ WHERE diff != 1
└────────┴──────┘
```
```sql
set max_block_size=100000 -- default value is 65536!
set max_block_size=100000 -- default value is 65536!
SELECT
number,
......@@ -886,7 +886,7 @@ Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many.
## identity()
Returns the same value that was used as its argument.
Returns the same value that was used as its argument.
```sql
SELECT identity(42)
......@@ -898,14 +898,14 @@ SELECT identity(42)
```
Used for debugging and testing: it lets you "break" the use of an index, so that you get the result and the performance of the query as a full scan.
## randomASCII {#randomascii}
## randomPrintableASCII {#randomascii}
Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters.
**Syntax**
```sql
randomASCII(length)
randomPrintableASCII(length)
```
**Parameters**
......@@ -923,14 +923,14 @@ Type: [String](../../data_types/string.md)
**Example**
```sql
SELECT number, randomASCII(30) as str, length(str) FROM system.numbers LIMIT 3
SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3
```
```text
┌─number─┬─str────────────────────────────┬─length(randomASCII(30))─┐
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
└────────┴────────────────────────────────┴─────────────────────────┘
┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
└────────┴────────────────────────────────┴──────────────────────────────────┘
```
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/other_functions/) <!--hide-->
......@@ -6,7 +6,7 @@
## FQDN {#fqdn}
Возвращает полное имя домена.
Возвращает полное имя домена.
**Синтаксис**
......@@ -377,7 +377,7 @@ neighbor(column, offset[, default_value])
**Возвращаемое значение**
- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока.
- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока.
- Значение по умолчанию для `column`, если значение `offset` выходит за пределы блока данных. Если передан параметр `default_value`, то значение берется из него.
Тип: зависит от данных в `column` или переданного значения по умолчанию в `default_value`.
......@@ -885,14 +885,14 @@ SELECT identity(42)
```
Используется для отладки и тестирования, позволяет "сломать" доступ по индексу, и получить результат и производительность запроса для полного сканирования.
## randomASCII {#randomascii}
## randomPrintableASCII {#randomascii}
Генерирует строку со случайным набором печатных символов [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters).
**Синтаксис**
```sql
randomASCII(length)
randomPrintableASCII(length)
```
**Параметры**
......@@ -910,14 +910,14 @@ randomASKII(length)
**Пример**
```sql
SELECT number, randomASCII(30) as str, length(str) FROM system.numbers LIMIT 3
SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3
```
```text
┌─number─┬─str────────────────────────────┬─length(randomASCII(30))─┐
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
└────────┴────────────────────────────────┴─────────────────────────┘
┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐
│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │
│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │
│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │
└────────┴────────────────────────────────┴──────────────────────────────────┘
```
[Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/other_functions/) <!--hide-->
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册