提交 6c814c9f 编写于 作者: P philip.han

Add bayesAB function for bayesian_ab_testing

上级 e0d19d2a
......@@ -174,3 +174,9 @@
[submodule "contrib/sentry-native"]
path = contrib/sentry-native
url = https://github.com/getsentry/sentry-native.git
[submodule "contrib/gcem"]
path = contrib/gcem
url = https://github.com/kthohr/gcem.git
[submodule "contrib/stats"]
path = contrib/stats
url = https://github.com/kthohr/stats.git
......@@ -319,4 +319,6 @@ if (USE_SENTRY)
endif()
add_subdirectory (fmtlib-cmake)
add_subdirectory (stats-cmake)
add_subdirectory (gcem)
Subproject commit 8d4f1b5d76ea8f6ff12f3f4f34cda45424556b00
Subproject commit b6dd459c10a88c7ea04693c007e9e35820c5d9ad
# stats is a header-only library of probability density functions,
# cumulative distribution functions, quantile functions, and random sampling methods.
set(STATS_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/stats/include)
set(GCEM_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/gcem/include)

# There is nothing to compile for a header-only library, so the target has to be an
# INTERFACE library: a regular add_library() without any source file fails at generation time.
add_library(stats INTERFACE)

# INTERFACE targets accept only INTERFACE usage requirements; everything that links
# against `stats` inherits both include directories (stats headers include gcem headers).
target_include_directories(stats SYSTEM INTERFACE ${STATS_INCLUDE_DIR})
target_include_directories(stats SYSTEM INTERFACE ${GCEM_INCLUDE_DIR})

add_dependencies(stats gcem)
......@@ -111,3 +111,5 @@ target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_url)
add_subdirectory(array)
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_array)
target_link_libraries(clickhouse_functions PRIVATE stats)
#include <math.h>
#include <sstream>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/abtesting.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromOStream.h>
#define STATS_ENABLE_STDVEC_WRAPPERS
#include <stats.hpp>
namespace DB
{
/// Error codes used by the exceptions thrown in this file.
/// They are only declared here (extern); the definitions are not part of this file.
namespace ErrorCodes
{
/// Thrown when an argument column has an unexpected type.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
/// Thrown when argument values are rejected (mismatched sizes, negative values, unknown distribution).
extern const int BAD_ARGUMENTS;
}
template <bool higher_is_better>
ABTestResult bayesian_ab_test(std::string distribution, std::vector<double> xs, std::vector<double> ys)
{
const size_t R = 1000, C = 100;
ABTestResult result;
std::vector<std::vector<double>> samples_matrix;
if (distribution == "beta")
{
double alpha, beta;
for (size_t i = 0; i < xs.size(); ++i)
if (xs[i] < ys[i])
throw Exception("Conversions cannot be larger than trials", ErrorCodes::BAD_ARGUMENTS);
for (size_t i = 0; i < xs.size(); ++i)
{
alpha = 1.0 + ys[i];
beta = 1.0 + xs[i] - ys[i];
samples_matrix.push_back(stats::rbeta<std::vector<double>>(R, C, alpha, beta));
}
}
else if (distribution == "gamma")
{
double shape, scale;
for (size_t i = 0; i < xs.size(); ++i)
{
shape = 1.0 + xs[i];
scale = 250.0 / (1 + 250.0 * ys[i]);
samples_matrix.push_back(stats::rgamma<std::vector<double>>(R, C, shape, scale));
}
}
std::vector<double> means;
for (size_t i = 0; i < xs.size(); ++i)
{
auto mean = accumulate(samples_matrix[i].begin(), samples_matrix[i].end(), 0.0) / samples_matrix[i].size();
means.push_back(mean);
}
// Beats control
result.beats_control.resize(xs.size(), 0);
for (size_t i = 1; i < xs.size(); ++i)
{
for (size_t n = 0; n < R * C; ++n)
{
if (higher_is_better)
{
if (samples_matrix[i][n] > samples_matrix[0][n])
++result.beats_control[i];
}
else
{
if (samples_matrix[i][n] < samples_matrix[0][n])
++result.beats_control[i];
}
}
}
for (size_t i = 1; i < xs.size(); ++i)
result.beats_control[i] = static_cast<double>(result.beats_control[i]) / R / C;
// To be best
std::vector<size_t> count_m(xs.size(), 0);
std::vector<double> row(xs.size(), 0);
result.best.resize(xs.size(), 0);
for (size_t n = 0; n < R * C; ++n)
{
for (size_t i = 0; i < xs.size(); ++i)
row[i] = samples_matrix[i][n];
double m;
if (higher_is_better)
m = *std::max_element(row.begin(), row.end());
else
m = *std::min_element(row.begin(), row.end());
for (size_t i = 0; i < xs.size(); ++i)
{
if (m == samples_matrix[i][n])
{
++result.best[i];
break;
}
}
}
for (size_t i = 0; i < xs.size(); ++i)
result.best[i] = static_cast<double>(result.best[i]) / R / C;
return result;
}
class FunctionBayesAB : public IFunction
{
public:
static constexpr auto name = "bayesAB";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionBayesAB>();
}
String getName() const override
{
return name;
}
size_t getNumberOfArguments() const override { return 5; }
DataTypePtr getReturnTypeImpl(const DataTypes &) const override
{
return std::make_shared<DataTypeString>();
}
const IColumn * getNestedConstColumn(Block & block, const ColumnNumbers & arguments, const size_t n)
{
const IColumn * col = block.getByPosition(arguments[n]).column.get();
const IColumn * nested_col;
ColumnPtr materialized_column;
if (const ColumnConst * const_arr = checkAndGetColumnConst<ColumnArray>(col))
{
materialized_column = const_arr->convertToFullColumn();
const auto & materialized_arr = typeid_cast<const ColumnArray &>(*materialized_column);
nested_col = &materialized_arr.getData();
}
else
throw Exception("Illegal column " + col->getName() + " as argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return nested_col;
}
std::vector<double> getDoubleValues(const IColumn * col)
{
const ColumnFloat64 * column = checkAndGetColumn<ColumnFloat64>(*col);
if (!column)
throw Exception("Illegal type of argument for function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
std::vector<double> ret;
for (size_t i = 0; i < column->size(); ++i)
ret.push_back(column->getData()[i]);
return ret;
}
std::vector<std::string> getStringValues(const IColumn * col)
{
const ColumnString * column = checkAndGetColumn<ColumnString>(*col);
if (!column)
throw Exception("Illegal type of argument for function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
std::vector<std::string> ret;
for (size_t i = 0; i < column->size(); ++i)
ret.push_back(column->getDataAt(i).data);
return ret;
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override
{
std::vector<double> xs, ys;
std::vector<std::string> variant_names;
std::string dist;
bool higher_is_better;
if (const ColumnConst * col_dist = checkAndGetColumnConst<ColumnString>(block.getByPosition(arguments[0]).column.get()))
dist = col_dist->getDataAt(0).data;
else
throw Exception("First argument for function " + getName() + " must be String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (const ColumnConst * col_higher_is_better = checkAndGetColumnConst<ColumnUInt8>(block.getByPosition(arguments[1]).column.get()))
higher_is_better = col_higher_is_better->getBool(0);
else
throw Exception("Second argument for function " + getName() + " must be Boolean", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
variant_names = getStringValues(getNestedConstColumn(block, arguments, 2));
xs = getDoubleValues(getNestedConstColumn(block, arguments, 3));
ys = getDoubleValues(getNestedConstColumn(block, arguments, 4));
if (variant_names.size() != xs.size() || xs.size() != ys.size())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Sizes of arguments doen't match: variant_names: {}, xs: {}, ys: {}",
variant_names.size(), xs.size(), ys.size());
if (std::count_if(xs.begin(), xs.end(), [](double v) { return v < 0; }) > 0 ||
std::count_if(ys.begin(), ys.end(), [](double v) { return v < 0; }) > 0)
throw Exception("Negative values don't allowed", ErrorCodes::BAD_ARGUMENTS);
ABTestResult test_result;
if (dist == "beta")
{
if (higher_is_better)
test_result = bayesian_ab_test<true>(dist, xs, ys);
else
test_result = bayesian_ab_test<false>(dist, xs, ys);
}
else if (dist == "gamma")
{
if (higher_is_better)
test_result = bayesian_ab_test<false>(dist, xs, ys);
else
test_result = bayesian_ab_test<true>(dist, xs, ys);
}
else
throw Exception("First argument for function " + getName() + " cannot be " + dist, ErrorCodes::BAD_ARGUMENTS);
FormatSettings settings;
std::stringstream s;
{
WriteBufferFromOStream buf(s);
writeCString("{\"data\":[", buf);
for (size_t i = 0; i < xs.size(); ++i)
{
writeCString("{\"variant_name\":", buf);
writeJSONString(variant_names[i], buf, settings);
writeCString(",\"beats_control\":", buf);
writeText(test_result.beats_control[i], buf);
writeCString(",\"to_be_best\":", buf);
writeText(test_result.best[i], buf);
writeCString("}", buf);
if (i != xs.size() -1) writeCString(",", buf);
}
writeCString("]}", buf);
}
auto dst = ColumnString::create();
std::string result_str = s.str();
dst->insertData(result_str.c_str(), result_str.length());
block.getByPosition(result).column = std::move(dst);
}
};
/// Makes the bayesAB function available through the given function factory.
void registerFunctionBayesAB(FunctionFactory & factory)
{
    factory.registerFunction<FunctionBayesAB>();
}
}
#pragma once
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
namespace DB
{

/// Outcome of a Bayesian A/B test. Both vectors hold one entry per variant, in input order.
struct ABTestResult
{
    /// Fraction of draws in which the variant beat the control (variant 0).
    std::vector<double> beats_control;
    /// Fraction of draws in which the variant was the best one among all variants.
    std::vector<double> best;
};

/// Runs a Bayesian A/B test for the given distribution name on the per-variant inputs xs and ys.
/// higher_is_better selects whether the larger or the smaller sampled value wins a comparison.
/// Only declared here; the definition lives in a source file.
template <bool higher_is_better>
ABTestResult bayesian_ab_test(std::string distribution, std::vector<double> xs, std::vector<double> ys);

}
......@@ -38,6 +38,7 @@ void registerFunctionsNull(FunctionFactory &);
void registerFunctionsJSON(FunctionFactory &);
void registerFunctionsConsistentHashing(FunctionFactory & factory);
void registerFunctionsUnixTimestamp64(FunctionFactory & factory);
void registerFunctionBayesAB(FunctionFactory &);
void registerFunctions()
......@@ -80,6 +81,7 @@ void registerFunctions()
registerFunctionsIntrospection(factory);
registerFunctionsConsistentHashing(factory);
registerFunctionsUnixTimestamp64(factory);
registerFunctionBayesAB(factory);
}
}
# Executables built from a single source file each.
add_executable (number_traits number_traits.cpp)
add_executable (abtesting abtesting.cpp)
target_link_libraries (number_traits PRIVATE dbms)
# abtesting is linked against clickhouse_functions
# (presumably because bayesian_ab_test is compiled into that library - confirm).
target_link_libraries (abtesting PRIVATE clickhouse_functions)
#include <Functions/abtesting.h>
#include <iostream>
#include <stdio.h>
/// Prints the inputs, runs the Bayesian A/B test for `dist` ("beta" or "gamma") and prints the
/// per-variant outcome to stdout.
/// On return `max` / `min` hold the index of the largest / smallest "to be best" value.
/// For any other `dist` nothing is run, the returned result is empty and both indexes are 0.
DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vector<double> ys, size_t & max, size_t & min)
{
    DB::ABTestResult outcome;

    std::cout << std::fixed;

    const bool is_beta = (dist == "beta");
    const bool is_gamma = !is_beta && (dist == "gamma");

    if (is_beta || is_gamma)
    {
        std::cout << dist << "\nclicks: ";
        for (const double value : xs)
            std::cout << value << " ";

        /// Only the label of the second input differs between the two distributions.
        std::cout << (is_beta ? "\tconversions: " : "\tcost: ");
        for (const double value : ys)
            std::cout << value << " ";
        std::cout << "\n";

        if (is_beta)
            outcome = DB::bayesian_ab_test<true>(dist, xs, ys);
        else
            outcome = DB::bayesian_ab_test<false>(dist, xs, ys);
    }

    const size_t variants = outcome.beats_control.size();

    for (size_t idx = 0; idx != variants; ++idx)
        std::cout << idx << " beats 0: " << outcome.beats_control[idx] << std::endl;

    for (size_t idx = 0; idx != variants; ++idx)
        std::cout << idx << " to be best: " << outcome.best[idx] << std::endl;

    const auto best_begin = outcome.best.begin();
    const auto best_end = outcome.best.end();
    max = std::max_element(best_begin, best_end) - best_begin;
    min = std::min_element(best_begin, best_end) - best_begin;

    return outcome;
}
/// Runs a fixed list of scenarios through test_bayesab and terminates with exit code 1
/// as soon as the variant with the highest "to be best" value is not the expected one.
int main(int, char **)
{
    struct Scenario
    {
        const char * dist;
        std::vector<double> xs;
        std::vector<double> ys;
        size_t expected_best;
    };

    const std::vector<Scenario> scenarios =
    {
        {"beta", {10000, 1000, 900}, {600, 110, 90}, 1},
        {"beta", {3000, 3000, 3000}, {600, 100, 90}, 0},
        {"beta", {3000, 3000, 3000}, {100, 90, 110}, 2},
        {"beta", {3000, 3000, 3000}, {110, 90, 100}, 0},
        {"gamma", {10000, 1000, 900}, {600, 110, 90}, 1},
        {"gamma", {3000, 3000, 3000}, {600, 100, 90}, 0},
        {"gamma", {3000, 3000, 3000}, {100, 90, 110}, 2},
        {"gamma", {3000, 3000, 3000}, {110, 90, 100}, 0},
    };

    for (const auto & scenario : scenarios)
    {
        size_t max, min;
        test_bayesab(scenario.dist, scenario.xs, scenario.ys, max, min);
        if (max != scenario.expected_best)
            exit(1);
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册