提交 76153ed8 编写于 作者: A Alexey Milovidov

clickhouse-obfuscator: allow generating more data than was in the source

上级 c67dbc51
......@@ -108,6 +108,9 @@ public:
/// Call generate: pass source data column to obtain a column with anonymized data as a result.
virtual ColumnPtr generate(const IColumn & column);
/// Deterministically change seed to some other value. This can be used to generate more values than were in source.
virtual void updateSeed();
virtual ~IModel() {}
};
......@@ -175,7 +178,7 @@ static UInt64 transform(UInt64 x, UInt64 seed)
class UnsignedIntegerModel : public IModel
{
private:
const UInt64 seed;
UInt64 seed;
public:
UnsignedIntegerModel(UInt64 seed_) : seed(seed_) {}
......@@ -195,6 +198,11 @@ public:
return res;
}
/// Replace the stored seed with hash(seed).
/// Each call derives the next seed from the current one, so repeated calls walk a chain of seeds
/// (reproducible as long as hash() is a pure function — it is defined outside this hunk).
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -211,7 +219,7 @@ static Int64 transformSigned(Int64 x, UInt64 seed)
class SignedIntegerModel : public IModel
{
private:
const UInt64 seed;
UInt64 seed;
public:
SignedIntegerModel(UInt64 seed_) : seed(seed_) {}
......@@ -231,6 +239,11 @@ public:
return res;
}
/// Advance the seed: the new value is hash() applied to the current one.
/// No other member is touched by this call.
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -253,7 +266,7 @@ template <typename Float>
class FloatModel : public IModel
{
private:
const UInt64 seed;
UInt64 seed;
Float src_prev_value = 0;
Float res_prev_value = 0;
......@@ -280,6 +293,11 @@ public:
return res_column;
}
/// Advance the seed: the new value is hash() applied to the current one.
/// NOTE(review): only `seed` is modified here; any other per-model state declared in the class
/// is left as it was — confirm that is intended when generation is restarted with a new seed.
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -294,6 +312,10 @@ public:
{
return column.cloneResized(column.size());
}
/// Intentionally empty: this override does nothing when asked to change the seed.
/// NOTE(review): the code just above returns a plain clone of the input column, so this model
/// presumably has no seed to change — confirm against the full class.
void updateSeed() override
{
}
};
......@@ -347,7 +369,7 @@ static void transformFixedString(const UInt8 * src, UInt8 * dst, size_t size, UI
class FixedStringModel : public IModel
{
private:
const UInt64 seed;
UInt64 seed;
public:
FixedStringModel(UInt64 seed_) : seed(seed_) {}
......@@ -373,6 +395,11 @@ public:
return res_column;
}
/// Replace the stored seed with hash(seed), deriving the next seed from the current one.
/// hash() is defined elsewhere in the file.
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -380,7 +407,7 @@ public:
class DateTimeModel : public IModel
{
private:
const UInt64 seed;
UInt64 seed;
UInt32 src_prev_value = 0;
UInt32 res_prev_value = 0;
......@@ -418,6 +445,11 @@ public:
return res_column;
}
/// Replace the stored seed with hash(seed), deriving the next seed from the current one.
/// NOTE(review): only `seed` is modified; other members of the class keep their current
/// values across this call — confirm that is intended.
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -790,6 +822,11 @@ public:
return res_column;
}
/// Advance the seed by assigning hash(seed) back to it.
/// NOTE(review): the enclosing class is not visible in this hunk; this assumes it declares
/// a mutable `seed` member — confirm.
void updateSeed() override
{
seed = hash(seed);
}
};
......@@ -823,6 +860,11 @@ public:
return ColumnArray::create((*std::move(new_nested_column)).mutate(), (*std::move(column_array.getOffsetsPtr())).mutate());
}
/// Forward the request to the wrapped model: this override only calls nested_model->updateSeed().
void updateSeed() override
{
nested_model->updateSeed();
}
};
......@@ -856,6 +898,11 @@ public:
return ColumnNullable::create((*std::move(new_nested_column)).mutate(), (*std::move(column_nullable.getNullMapColumnPtr())).mutate());
}
/// Pass the seed change through to nested_model; nothing else is modified by this call.
void updateSeed() override
{
nested_model->updateSeed();
}
};
......@@ -939,6 +986,12 @@ public:
res[i] = models[i]->generate(*columns[i]);
return res;
}
void updateSeed()
{
for (auto & model : models)
model->updateSeed();
}
};
}
......@@ -993,7 +1046,7 @@ try
std::string input_format = options["input-format"].as<std::string>();
std::string output_format = options["output-format"].as<std::string>();
std::optional<UInt64> limit;
UInt64 limit = 0;
if (options.count("limit"))
limit = options["limit"].as<UInt64>();
......@@ -1045,27 +1098,32 @@ try
UInt64 max_block_size = 8192;
/// Train step
UInt64 source_rows = 0;
{
if (!silent)
std::cerr << "Training models\n";
BlockInputStreamPtr input = context.getInputFormat(input_format, file_in, header, max_block_size);
UInt64 processed_rows = 0;
input->readPrefix();
while (Block block = input->read())
{
obfuscator.train(block.getColumns());
processed_rows += block.rows();
source_rows += block.rows();
if (!silent)
std::cerr << "Processed " << processed_rows << " rows\n";
std::cerr << "Processed " << source_rows << " rows\n";
}
input->readSuffix();
}
obfuscator.finalize();
if (!limit)
limit = source_rows;
/// Generation step
UInt64 processed_rows = 0;
while (processed_rows < limit)
{
if (!silent)
std::cerr << "Generating data\n";
......@@ -1075,10 +1133,9 @@ try
BlockInputStreamPtr input = context.getInputFormat(input_format, file_in, header, max_block_size);
BlockOutputStreamPtr output = context.getOutputFormat(output_format, file_out, header);
if (limit)
input = std::make_shared<LimitBlockInputStream>(input, *limit, 0);
if (processed_rows + source_rows > limit)
input = std::make_shared<LimitBlockInputStream>(input, limit - processed_rows, 0);
UInt64 processed_rows = 0;
input->readPrefix();
output->writePrefix();
while (Block block = input->read())
......@@ -1091,6 +1148,8 @@ try
}
output->writeSuffix();
input->readSuffix();
obfuscator.updateSeed();
}
return 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册