Commit 80ce7edb authored by Yancey1989

make forward correct

Parent 74f519ff
......@@ -70,7 +70,7 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......@@ -96,7 +96,7 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetKernelType(
+  framework::OpKernelType GetActualKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......
......@@ -49,34 +49,31 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     auto& device_ctx = ctx.template device_context<DeviceContext>();
     math::RowwiseSum<DeviceContext, T> row_sum;
-    math::MatrixBitCodeFunctor<T> bit_code;
+    math::MatrixBitCodeFunctor<T> bit_code(num_classes, ids->data<int64_t>());
     std::vector<int64_t> sum_dims({batch_size, 1UL});
     sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
     auto sum_mat = EigenMatrix<T>::From(sum);
     out->mutable_data<T>(ctx.GetPlace());
     auto out_mat = framework::EigenVector<T>::Flatten(*out);
     if (bias) {
-      bit_code.Add(num_classes, ids->data<int64_t>(), pre_out, *bias);
+      bit_code.Add(pre_out, *bias);
     }
-    for (int i = 0; i < in->dims()[0]; ++i) {
-      bit_code.Mul(num_classes, ids->data<int64_t>(), pre_out,
-                   w->Slice(i, i + 1), in->Slice(i, i + 1));
+    for (int64_t i = 0; i < batch_size; ++i) {
+      auto w_i = w->Slice(i, i + 1);
+      bit_code.Mul(pre_out, w_i, *in);
     }
     // clip the matrix with (-40, 40)
     Transform<DeviceContext> trans;
     trans(ctx.template device_context<DeviceContext>(), pre_out_data,
           pre_out_data + pre_out.numel(), pre_out_data,
           ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
-    bit_code.Sum(num_classes, ids->data<int64_t>(), pre_out, *out,
-                 static_cast<T>(-1));
+    bit_code.Sum(pre_out, *out, static_cast<T>(-1));
     // softrelu with threshold is 40.0
     trans(ctx.template device_context<DeviceContext>(), pre_out_data,
           pre_out_data + pre_out.numel(), pre_out_data,
           ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
     pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
     row_sum(device_ctx, pre_out, &sum);
     out_mat.device(place) = sum_mat + out_mat;
   }
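For intuition, the forward sequence above is: add the path biases into pre_out, accumulate the path inner products, clip, write the negative bit-weighted sum into out, apply softrelu, and add the row sums. A minimal NumPy sketch of the last four steps, with pre_out and the per-sample bit masks as assumed inputs (illustrative names, not from this patch):

import numpy as np

pre_out = np.array([[0.3, -0.7], [1.2, 0.4]])  # bias + code(w) * x, precomputed
bits = np.array([[1, 0], [1, 1]])              # calc_bit(j) of each sample's code

pre_out = np.clip(pre_out, -40.0, 40.0)              # ClipFunctor
out = -1.0 * (bits * pre_out).sum(1, keepdims=True)  # bit_code.Sum with scale -1
pre_out = np.log(1.0 + np.exp(pre_out))              # softrelu, threshold 40
out += pre_out.sum(1, keepdims=True)                 # row_sum added into out_mat
print(out)  # shape (2, 1)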
......@@ -103,28 +100,26 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto pre_out_mat = EigenMatrix<T>::From(pre_out);
     // init pre_out matrix with {1.0}
     math::SetConstant<DeviceContext, T> one;
-    math::MatrixBitCodeFunctor<T> bit_code;
+    math::MatrixBitCodeFunctor<T> bit_code(num_classes, ids->data<int64_t>());
     one(device_ctx, &pre_out, static_cast<T>(1.0));
     // softrelu derivative
     pre_out_mat.device(place) =
         pre_out_mat * (static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat);
-    bit_code.Sub(num_classes, ids->data<int64_t>(), pre_out);
+    bit_code.Sub(pre_out);
     if (bias) {
       bias->mutable_data<T>(ctx.GetPlace());
-      bit_code.AddGrad(num_classes, ids->data<int64_t>(), pre_out, *bias);
+      bit_code.AddGrad(pre_out, *bias);
     }
     in_grad->mutable_data<T>(ctx.GetPlace());
     w->mutable_data<T>(ctx.GetPlace());
-    for (int i = 0; i < in_grad->dims()[0]; ++i) {
-      auto p_sliced = w->Slice(i, i + 1);
-      auto in_sliced = in->Slice(i, i + 1);
-      auto in_grad_sliced = in_grad->Slice(i, i + 1);
-      bit_code.MulGradWeight(num_classes, ids->data<int64_t>(), pre_out,
-                             p_sliced, in_sliced);
-      bit_code.MulGradError(num_classes, ids->data<int64_t>(), pre_out,
-                            p_sliced, in_grad_sliced);
+    for (int i = 0; i < batch_size; ++i) {
+      auto w_i = w->Slice(i, i + 1);
+      // auto in_i = in->Slice(i, i + 1);
+      // auto in_grad_i = in_grad->Slice(i, i + 1);
+      bit_code.MulGradWeight(pre_out, w_i, *in);
+      bit_code.MulGradError(pre_out, w_i, *in_grad);
     }
   }
 };
......
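One step worth unpacking is the `// softrelu derivative` comment: d/dx log(1 + e^x) = sigmoid(x), which can also be written via the activation value o = log(1 + e^x) as 1 - e^(-o). A quick NumPy check of that identity (an editorial aside, not part of this patch):

import numpy as np

x = np.linspace(-3.0, 3.0, 7)
o = np.log(1.0 + np.exp(x))   # softrelu(x)
deriv = 1.0 - np.exp(-o)      # derivative expressed through the stored value o
assert np.allclose(deriv, 1.0 / (1.0 + np.exp(-x)))  # equals sigmoid(x)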
......@@ -62,13 +62,13 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename DeviceContext, typename T>
 void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const framework::Tensor& input,
-                                              framework::Tensor* vector) {
+                                              framework::Tensor* out) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[1];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  PADDLE_ENFORCE_EQ(out->numel(), size);

-  auto in = framework::EigenMatrix<T, Eigen::ColMajor>::From(input);
-  auto vec = framework::EigenMatrix<T, Eigen::ColMajor>::From(*vector);
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
   vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
 }
......
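The new RowwiseSum contract reduces along dimension 1, producing one scalar per row, with the output treated as a flat vector rather than a ColMajor matrix. A NumPy equivalent (illustrative only):

import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)
out = x.sum(axis=1)                      # Eigen: in.sum(Eigen::array<int, 1>({{1}}))
assert out.size == x.size // x.shape[1]  # mirrors the PADDLE_ENFORCE_EQ check
print(out)  # [ 3. 12.]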
......@@ -22,7 +22,7 @@ namespace math {
  * CodeTable class should support 3 functions:
  *
  * size_t size()
- *   return the number of codes
+ *   return the number of ids
  *
  * int getMaxCodeLength()
  *   return the maximal code length
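For intuition about the code table itself: a label id is mapped to node c = num_classes + id of a complete binary tree, its code length is the number of inner nodes on the path to the root, calc_index(j) names the j-th inner node, and calc_bit(j) is the j-th branch direction. A worked example mirroring the CodeTable helper defined in the Python test below (assumed semantics):

import math

def find_latest_set(num):
    # position of the highest set bit, as in the test helper below
    return 1 + int(math.floor(math.log(num, 2)))

num_classes, label = 4, 1
c = num_classes + label              # 5 == 0b101
length = find_latest_set(c) - 1      # 2 inner nodes on the path
for j in range(length):
    print((c >> (j + 1)) - 1,        # calc_index(j): 1, then 0
          bool(c & (1 << j)))        # calc_bit(j): True, then False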
......@@ -45,56 +45,47 @@ namespace math {
  *
  */

-/*
-  for i:
-    for j < codeLength:
-      op(a(i, j), b(0, index(i, j)))
-*/
-template <typename T, class CodeTable, class Op>
-static void AddByBitCodeT(Op op, CodeTable code_table, const int64_t* codes,
-                          const framework::Tensor& tmat,
-                          const framework::Tensor& vec) {
-  size_t num_sample = tmat.dims()[0];
-  size_t width = vec.dims()[1];
-  for (size_t i = 0; i < num_sample; ++i) {
-    auto code = code_table(static_cast<size_t>(codes[i]));
+template <typename T>
+void MatrixBitCodeFunctor<T>::Add(framework::Tensor& tmat,
+                                  const framework::Tensor& vec) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code.calc_index(j);
-      auto t = tmat.data<T>()[i * width + j];
-      auto v = vec.data<T>()[index];
-      op(t, v);
+      tmat.data<T>()[i * width + j] += vec.data<T>()[index];
     }
   }
 }

-template <typename T, class CodeTable>
-void SubByBitCodeT(CodeTable code_table, const int64_t* codes,
-                   framework::Tensor& tmat) {
-  // size_t max_code_length = code_table.get_max_code_length();
-  size_t num_samples = tmat.dims()[0];
-  size_t o_width = tmat.dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(codes[i]));
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(framework::Tensor& tmat,
+                                      framework::Tensor& vec) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
-      if (code.calc_bit(j)) {
-        tmat.data<T>()[i * o_width + j] -= 1;
-      }
+      size_t index = code.calc_index(j);
+      vec.data<T>()[index] += tmat.data<T>()[i * width + j];
     }
   }
 }

-template <typename T, class CodeTable>
-void SumByBitCodeT(CodeTable code_table, const int64_t* codes,
-                   framework::Tensor& tmat, framework::Tensor& sum,
-                   const T& scale_sum) {
-  // size_t max_code_length = code_table.get_max_code_length();
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sum(framework::Tensor& tmat,
+                                  framework::Tensor& sum, T scale_sum) {
+  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
   size_t o_width = tmat.dims()[1];
   for (size_t i = 0; i < num_samples; ++i) {
     T sm = static_cast<T>(0.0);
-    auto code = code_table(static_cast<size_t>(codes[i]));
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
       if (code.calc_bit(j)) {
......@@ -106,116 +97,99 @@ void SumByBitCodeT(CodeTable code_table, const int64_t* codes,
 }

-template <typename T>
-void MatrixBitCodeFunctor<T>::Add(size_t num_classes, const int64_t* codes,
-                                  framework::Tensor& tmat,
-                                  const framework::Tensor& vec) {
-  auto op = [](T& t, const T& v) { t += v; };
-  AddByBitCodeT<T>(op, SimpleCodeTable(num_classes), codes, tmat, vec);
-}
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(size_t num_classes, const int64_t* codes,
-                                      framework::Tensor& tmat,
-                                      framework::Tensor& vec) {
-  auto op = [](T& t, T& v) { v += t; };
-  AddByBitCodeT<T>(op, SimpleCodeTable(num_classes), codes, tmat, vec);
-}
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::Sum(size_t num_classes, const int64_t* codes,
-                                  framework::Tensor& tmat,
-                                  framework::Tensor& sum, T scale_sum) {
-  SumByBitCodeT<T>(SimpleCodeTable(num_classes), codes, tmat, sum, scale_sum);
-}
-
 template <typename T>
-void MatrixBitCodeFunctor<T>::Mul(size_t num_classes, const int64_t* codes,
-                                  framework::Tensor& tmat,
+void MatrixBitCodeFunctor<T>::Mul(framework::Tensor& tmat,
                                   const framework::Tensor& weight,
                                   const framework::Tensor& input) {
+  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
   size_t tmat_width = tmat.dims()[1];
   size_t input_width = input.dims()[1];
-  size_t weight_width = weight.dims()[1];
-  auto tmat_p = tmat.data<T>();
-  auto weight_p = weight.data<T>();
-  auto input_p = input.data<T>();
-  auto code_table = SimpleCodeTable(num_classes);
+  size_t weight_width = weight.dims()[2];
+  auto tmat_value = tmat.data<T>();
+  auto weight_value = weight.data<T>();
+  auto input_value = input.data<T>();
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(codes[i]));
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code.calc_index(j);
       T sum = static_cast<T>(0.0);
       for (size_t k = 0; k < input_width; ++k) {
-        sum +=
-            weight_p[weight_width * index + k] * input_p[input_width * i + k];
+        sum += weight_value[weight_width * index + k] *
+               input_value[input_width * i + k];
       }
-      tmat_p[i * tmat_width + j] += sum;
+      tmat_value[i * tmat_width + j] += sum;
     }
   }
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(size_t num_classes,
-                                            const int64_t* codes,
-                                            const framework::Tensor& tmat,
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                             framework::Tensor& weight,
                                             const framework::Tensor& input) {
+  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
+  size_t tmat_width = tmat.dims()[1];
   size_t input_width = input.dims()[1];
   size_t weight_width = weight.dims()[1];
-  auto tmat_p = tmat.data<T>();
-  auto weight_p = weight.data<T>();
-  auto input_p = input.data<T>();
-  auto code_table = SimpleCodeTable(num_classes);
+  auto tmat_value = tmat.data<T>();
+  auto weight_value = weight.data<T>();
+  auto input_value = input.data<T>();
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(codes[i]));
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code.calc_index(j);
       for (size_t k = 0; k < input_width; ++k) {
-        weight_p[weight_width * index * k] +=
-            tmat_p[i * weight_width * j] * input_p[input_width * i + k];
+        // weight.row(index(i, j)) += tmat(i, j) * input.row(i)
+        weight_value[weight_width * index + k] +=
+            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
       }
     }
   }
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradError(size_t num_classes,
-                                           const int64_t* codes,
-                                           const framework::Tensor& tmat,
+void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
                                            const framework::Tensor& weight,
                                            framework::Tensor& input) {
+  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
+  size_t tmat_width = tmat.dims()[1];
   size_t input_width = input.dims()[1];
   size_t weight_width = weight.dims()[1];
-  auto tmat_p = tmat.data<T>();
-  auto weight_p = weight.data<T>();
-  auto input_p = input.data<T>();
-  auto code_table = SimpleCodeTable(num_classes);
+  auto tmat_value = tmat.data<T>();
+  auto weight_value = weight.data<T>();
+  auto input_value = input.data<T>();
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(codes[i]));
+    auto code = code_table(static_cast<size_t>(ids_[i]));
     int code_length = code.get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code.calc_index(j);
       for (size_t k = 0; k < input_width; ++k) {
-        input_p[weight_width * index * k] +=
-            tmat_p[i * weight_width * j] * weight_p[weight_width * i + k];
+        // input.row(i) += tmat(i, j) * weight.row(index(i, j))
+        input_value[input_width * i + k] +=
+            tmat_value[i * tmat_width + j] *
+            weight_value[weight_width * index + k];
       }
     }
   }
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sub(size_t num_classes, const int64_t* codes,
-                                  framework::Tensor& tmat) {
-  SubByBitCodeT<T>(SimpleCodeTable(num_classes), codes, tmat);
+void MatrixBitCodeFunctor<T>::Sub(framework::Tensor& tmat) {
+  SimpleCodeTable code_table(num_classes_);
+  size_t num_samples = tmat.dims()[0];
+  size_t o_width = tmat.dims()[1];
+  for (size_t i = 0; i < num_samples; ++i) {
+    auto code = code_table(static_cast<size_t>(ids_[i]));
+    int code_length = code.get_length();
+    for (int j = 0; j < code_length; ++j) {
+      if (code.calc_bit(j)) {
+        tmat.data<T>()[i * o_width + j] -= 1;
+      }
+    }
+  }
 }

 template class MatrixBitCodeFunctor<float>;
......
......@@ -63,46 +63,45 @@ struct SimpleCodeTable {
 template <typename T>
 class MatrixBitCodeFunctor {
  public:
+  explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
+      : num_classes_(num_classes), ids_(ids) {}
+
   /* For j < code_length
        tmat(i, j) += vec(0, index(i, j))
   */
-  void Add(size_t num_classes, const int64_t* codes, framework::Tensor& tmat,
-           const framework::Tensor& vec);
+  void Add(framework::Tensor& tmat, const framework::Tensor& vec);

   /* For j < code_length
        vec(0, index(i, j)) += tmat(i, j)
   */
-  void AddGrad(size_t num_classes, const int64_t* codes,
-               framework::Tensor& tmat, framework::Tensor& vec);
+  void AddGrad(framework::Tensor& tmat, framework::Tensor& vec);

   /* For j < code_length
        sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j)
   */
-  void Sum(size_t num_classes, const int64_t* codes, framework::Tensor& tmat,
-           framework::Tensor& sum, T scale_sum);
+  void Sum(framework::Tensor& tmat, framework::Tensor& sum, T scale_sum);

   /* For j < code_length
        tmat(i, j) -= bit(i, j)
   */
-  void Sub(size_t num_classes, const int64_t* codes, framework::Tensor& tmat);
+  void Sub(framework::Tensor& tmat);

   /* For j < code_length
        tmat(i, j) += \sum_k weight(index(i, j), k) * input(i, k)
   */
-  void Mul(size_t num_classes, const int64_t* codes, framework::Tensor& tmat,
-           const framework::Tensor& weight, const framework::Tensor& input);
+  void Mul(framework::Tensor& tmat, const framework::Tensor& weight,
+           const framework::Tensor& input);

   /* For index(i, j) >= 0:
        weight.row(index(i, j)) += tmat(i, j) * input.row(i)
   */
-  void MulGradWeight(size_t num_classes, const int64_t* codes,
-                     const framework::Tensor& tmat, framework::Tensor& weight,
+  void MulGradWeight(const framework::Tensor& tmat, framework::Tensor& weight,
                      const framework::Tensor& input);

   /* For j < code_length
        input.row(i) += tmat(i, j) * weight.row(index(i, j))
   */
-  void MulGradError(size_t num_classes, const int64_t* codes,
-                    const framework::Tensor& tmat,
+  void MulGradError(const framework::Tensor& tmat,
                     const framework::Tensor& weight, framework::Tensor& input);
+
+  size_t num_classes_;
+  const int64_t* ids_;
 };
 }  // namespace math
 }  // namespace operators
......
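Read together with the new constructor, each method now closes over num_classes_ and ids_ instead of receiving them per call. The per-method comments above pin down the math; a small NumPy sketch of Add and Sum under those conventions, with index/bit arrays standing in for calc_index/calc_bit (illustrative names, not this API):

import numpy as np

batch, code_len = 2, 3
tmat = np.zeros((batch, code_len), dtype=np.float32)
vec = np.array([[0.5, -1.0, 2.0, 0.0, 1.5]], dtype=np.float32)
index = np.array([[1, 0, 2], [4, 3, 0]])  # stand-in for index(i, j)
bit = np.array([[1, 0, 1], [1, 1, 0]])    # stand-in for bit(i, j)

# Add: tmat(i, j) += vec(0, index(i, j))
for i in range(batch):
    for j in range(code_len):
        tmat[i, j] += vec[0, index[i, j]]

# Sum: sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j)
scale_sum = -1.0
s = scale_sum * (bit * tmat).sum(axis=1, keepdims=True)
print(s)  # shape (2, 1)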
......@@ -49,6 +49,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
     for attr_name in Operator.get_op_attr_names(op_type):
         if attr_name in attrs:
             kwargs[attr_name] = attrs[attr_name]
+
     return Operator(op_type, **kwargs)
......@@ -104,8 +105,6 @@ def get_numeric_gradient(scope,
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.DataType.FP64:
         tensor_to_check_dtype = np.float64
-    elif tensor_to_check_dtype == core.DataType.INT64:
-        tensor_to_check_dtype = np.int64
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
......@@ -115,8 +114,6 @@ def get_numeric_gradient(scope,
     def __get_elem__(tensor, i):
         if tensor_to_check_dtype == np.float32:
             return tensor.get_float_element(i)
-        elif tensor_to_check_dtype == np.int64:
-            return tensor.get_int64_element(i)
         else:
             return tensor.get_double_element(i)
......@@ -356,11 +353,13 @@ class OpTest(unittest.TestCase):
         op_attrs = self.attrs if hasattr(self, "attrs") else dict()
         self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
                             op_attrs)
         if no_grad_set is None:
             no_grad_set = set()

+        if not type(output_names) is list:
+            output_names = [output_names]
+
         numeric_grads = user_defined_grads or [
             get_numeric_gradient(
                 self.scope,
......@@ -456,7 +455,9 @@ class OpTest(unittest.TestCase):
         # infer variable type and infer shape in compile-time
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)

         mean_inputs = map(block.var, output_names)

         if len(mean_inputs) == 1:
             loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
             op = block.append_op(
......
 import unittest
 import numpy as np
 from op_test import OpTest
+import math
+
+
+def find_latest_set(num):
+    return 1 + int(math.floor(math.log(num, 2)))
+
+
+class CodeTable(object):
+    def __init__(self, num_classes, code):
+        self.c = num_classes + code
+
+    def cal_index(self, bit):
+        return (self.c >> (bit + 1)) - 1
+
+    def get_length(self):
+        return find_latest_set(self.c) - 1
+
+    def cal_bit(self, bit):
+        return self.c & (1 << bit)
+
+
+def hsigmoid(x, w, ids, bias, num_classes):
+    # code length = find_latest_set(num_classes - 1)
+    # initialize pre out with dims={batch_size, code_length}
+    batch_size = x.shape[0]
+    code_length = find_latest_set(num_classes - 1)
+    code_table = [0 for _ in range(code_length)]
+    pre_output = np.zeros((batch_size, code_length))
+    pre_sum = np.zeros((batch_size, 1))
+    out = np.zeros((batch_size, 1)).astype("float32")
+    # pre_out += code(bias)
+    for i in xrange(batch_size):
+        code_table = CodeTable(num_classes, ids[i])
+        length = code_table.get_length()
+        for j in xrange(length):
+            idx = code_table.cal_index(j)
+            pre_output[i][j] += bias[0][idx]
+    # pre_out += code(w) * x
+    for i in xrange(batch_size):
+        for j in xrange(batch_size):
+            code_table = CodeTable(num_classes, ids[j])
+            length = code_table.get_length()
+            for k in xrange(length):
+                idx = code_table.cal_index(k)
+                sum = 0.0
+                for l in xrange(x.shape[1]):
+                    sum += w[i][idx][l] * x[j][l]
+                pre_output[j][k] += sum
+    # clip to [-40.0, 40.0]; np.clip is not in-place, so keep the result
+    pre_output = np.clip(pre_output, -40.0, 40.0)
+    # out(i, 0) = -1 * \sum_j bit(i, j) * pre_output(i, j)
+    for i in xrange(batch_size):
+        code_table = CodeTable(num_classes, ids[i])
+        length = code_table.get_length()
+        sum = 0.0
+        for j in xrange(length):
+            if code_table.cal_bit(j):
+                sum += pre_output[i][j]
+        out[i] = -1.0 * sum
+    # soft relu
+    pre_output = np.clip(pre_output, -40.0, 40.0)
+    pre_output = np.log(1 + np.exp(pre_output))
+    pre_sum = pre_output.sum(1).reshape((batch_size, 1))
+    out += pre_sum
+    return out
+
+
 class TestHSigmoidOp(OpTest):
......@@ -16,9 +81,8 @@ class TestHSigmoidOp(OpTest):
         bias = np.random.random((1, num_classes - 1)).astype("float32")
         self.inputs = {'X': x, 'W': w, 'Ids': ids, 'Bias': bias}
         self.attrs = {'num_classes': num_classes}
-        self.outputs = {
-            'Out': np.random.random((batch_size, 1)).astype("float32")
-        }
+        out = hsigmoid(x, w, ids, bias, num_classes)
+        self.outputs = {'Out': out}

     def test_check_output(self):
         self.check_output()
......
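The reference hsigmoid added above can also be exercised on its own (under Python 2, as the xrange usage implies). A small driver with assumed shapes, w carrying one (num_classes - 1, dim) block per sample just as the kernel's w->Slice(i, i + 1) suggests; run it with hsigmoid and its helpers from the test file in scope:

import numpy as np

batch_size, num_classes, dim = 2, 6, 5
x = np.random.random((batch_size, dim)).astype("float32")
w = np.random.random((batch_size, num_classes - 1, dim)).astype("float32")
ids = np.random.randint(0, num_classes, (batch_size, )).astype("int64")
bias = np.random.random((1, num_classes - 1)).astype("float32")

out = hsigmoid(x, w, ids, bias, num_classes)
print(out.shape)  # (batch_size, 1)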