From 1abd3b3a29b6964323d679d47dea31830f5b5e6a Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 27 Nov 2017 19:28:57 +0800 Subject: [PATCH] implement forward --- paddle/operators/CMakeLists.txt | 4 +- paddle/operators/hierarchical_sigmoid_op.cc | 6 +- paddle/operators/hierarchical_sigmoid_op.h | 39 ++++++++++- paddle/operators/math/math_function.cc | 2 + paddle/operators/math/math_function.cu | 2 + paddle/operators/math/math_function.h | 6 ++ paddle/operators/math/math_function_impl.h | 14 ++++ paddle/operators/math/matrix_bit_code.cc | 77 +++++++++++++++++++-- paddle/operators/math/matrix_bit_code.h | 19 ++++- 9 files changed, 157 insertions(+), 12 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index a719da25602..93ec763424d 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -185,7 +185,8 @@ set(DEPS_OPS tensor_array_read_write_op gru_op adagrad_op - sgd_op) + sgd_op + hierarchical_sigmoid_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) @@ -203,6 +204,7 @@ op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc) +op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) endif() diff --git a/paddle/operators/hierarchical_sigmoid_op.cc b/paddle/operators/hierarchical_sigmoid_op.cc index 9b7af92662e..f81f3d34d19 100644 --- a/paddle/operators/hierarchical_sigmoid_op.cc +++ b/paddle/operators/hierarchical_sigmoid_op.cc @@ -85,12 +85,16 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "(TensorArray, required) The input array. 
Each Tensor has the " "same shape with [N * D].") .AsDuplicable(); + AddInput("Parameters", + "(Tensor, required), The parameters of hierarchical " + "sigmoid operator, each of them is s a 2-D tensor.") + .AsDuplicable(); AddInput("Label", "(Tensor, required), The labels of training data. It's a" "1-D tensor."); AddInput("Bias", "(Tensor, optional), The bias is a 1-D tensor, " - "which is applied to the output"); + "which is applied to the output."); AddOutput( "Out", "(Tensor, required) The output of hierarchical sigmoid operator."); diff --git a/paddle/operators/hierarchical_sigmoid_op.h b/paddle/operators/hierarchical_sigmoid_op.h index 11a553a4039..baf655f2141 100644 --- a/paddle/operators/hierarchical_sigmoid_op.h +++ b/paddle/operators/hierarchical_sigmoid_op.h @@ -14,28 +14,61 @@ limitations under the License. */ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/matrix_bit_code.h" namespace paddle { namespace operators { -template +template +using EigenMatrix = framework::EigenMatrix; + +template class HierarchicalSigmoidOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); + auto params = ctx.MultiInput("Parameters"); auto* label = ctx.Input("Label"); auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + + framework::Tensor sum; + framework::Tensor pre_out; + auto place = ctx.GetEigenDevice(); + auto& device_ctx = ctx.device_context(); + math::ColwiseSum col_sum; + math::RowwiseSum row_sum; + + auto pre_out_mat = EigenMatrix::From(pre_out); int64_t batch_size = ins[0]->dims()[0]; int64_t size = ins.size(); - framework::Tensor pre_out; + std::vector pre_out_dims({batch_size, size}); pre_out.mutable_data(framework::make_ddim(pre_out_dims), ctx.GetPlace()); + std::vector sum_dims({batch_size, 1UL}); + 
sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); + out->mutable_data(ctx.GetPlace()); - if (bias != NULL) { + if (bias) { math::AddByBitCode(num_classes, *label, pre_out, *bias); } + + for (size_t i = 0; i < ins.size(); ++i) { + math::MulByBitCode(num_classes, *label, pre_out, *params[i], *ins[i]); + } + // clip the matrix with (-40, 40) + pre_out_mat.device(place) = + pre_out_mat.abs().cwiseMax(static_cast(40.0)); + math::SumByBitCode(num_classes, *label, *out, pre_out, + static_cast(-1)); + // softrelu + pre_out_mat.device(place) = (static_cast(1) + pre_out_mat.exp()).log(); + + row_sum(device_ctx, pre_out, &sum); + col_sum(device_ctx, *out, &sum); } }; diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2e333a8cde7..3bc0945fe3a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -314,6 +314,8 @@ template struct RowwiseAdd; template struct RowwiseAdd; template struct ColwiseSum; template struct ColwiseSum; +template struct RowwiseSum; +template struct RowwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 58356a4b778..1a226821f71 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -298,6 +298,8 @@ template struct RowwiseAdd; template struct RowwiseAdd; template struct ColwiseSum; template struct ColwiseSum; +template struct RowwiseSum; +template struct RowwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index ffb99f53808..c21a20fc326 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -130,6 +130,12 @@ struct ColwiseSum { const framework::Tensor& input, framework::Tensor* vec); }; +template +struct RowwiseSum { + void operator()(const platform::DeviceContext& context, 
+ const framework::Tensor& input, framework::Tensor* vec); +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e525..8c1971fc611 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -78,6 +78,20 @@ void ColwiseSum::operator()(const platform::DeviceContext& context, in.sum(Eigen::array({{0}})).reshape(shape); } +template +void RowwiseSum::operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[1]; + PADDLE_ENFORCE_EQ(vector->numel(), size); + + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenMatrix::From(*vector); + Eigen::array shape({{static_cast(size), 1}}); + vec.reshape(shape).device(*context.GetEigenDevice()) = + in.sum(Eigen::array({{0}})).reshape(shape); +} } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/matrix_bit_code.cc b/paddle/operators/math/matrix_bit_code.cc index 30c2ffc2cfd..8f68e2f79dd 100644 --- a/paddle/operators/math/matrix_bit_code.cc +++ b/paddle/operators/math/matrix_bit_code.cc @@ -53,18 +53,18 @@ namespace math { template static void AddByBitCodeT(Op op, CodeTable code_table, const framework::Tensor& codes, framework::Tensor& a, - framework::Tensor& b) { + const framework::Tensor& b) { size_t num_classes = code_table.size(); size_t max_code_length = code_table.get_max_code_length(); - size_t num_sample = a.dims()[0].size(); - size_t width = a.dims()[1].size(); + size_t num_sample = a.dims()[0]; + size_t width = a.dims()[1]; for (size_t i = 0; i < num_sample; ++i) { - auto code = code_table(codes.data()[i]) int code_length = - code.get_length(); + auto code = code_table(codes.data()[i]); + int code_length = code.get_length(); for (int j = 0; j < 
code_length; + j) { size_t index = code.calc_index(j); - op(a.data()[i * width + j], b.data()[index]); + op(a.data()[i * width + j], b.data()[index]); } } } @@ -79,6 +79,71 @@ void AddByBitCode(size_t num_classes, const framework::Tensor& codes, AddByBitCodeT(op, SimpleCodeTable(num_classes), codes, a, b); } +template +void SumByBitCodeT(CodeTable code_table, const framework::Tensor& codes, + framework::Tensor& tmat, framework::Tensor& sum, + const T& scale_sum) { + size_t max_code_length = code_table.get_max_code_length(); + size_t num_samples = tmat.dims()[0]; + size_t o_width = tmat.dims()[1]; + for (size_t i = 0; i < num_samples; ++i) { + T sm = 0; + auto code = code_table(codes.data()[i]); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + sm += tmat.data()[i * o_width + j]; + } + } + sum.data()[i] = scale_sum * sm; + } +} +/* For j < codeLength: + sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j) +*/ +template +void SumByBitCode(size_t num_classes, const framework::Tensor& codes, + framework::Tensor& tmat, framework::Tensor& sum, + T scale_sum) { + SumByBitCodeT(SimpleCodeTable(num_classes), codes, tmat, scale_sum); +} + +template +void MulByBitCodeT(Op op, CodeTable code_table, const framework::Tensor& codes, + framework::Tensor& tmat, framework::Tensor& weight, + framework::Tensor& input) { + size_t num_classes = code_table.size(); + size_t max_code_length = code_table.get_max_code_length(); + size_t num_samples = tmat.dims()[0]; + size_t input_dim = input.dims()[1]; + size_t o_width = tmat.dims()[1]; + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(codes.data()[i]); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + op(tmat.data()[i * o_width + j], + weight.data() + index * weight.dims()[1], + input.data() + i * input.dims()[1], input_dim); + } + } +} + +template +void MulByBitCode(size_t num_classes, const 
framework::Tensor& codes, + framework::Tensor& tmat, const framework::Tensor& weight, + const framework::Tensor& input) { + auto op = [](T& t, const T* weight_row, const T* input_row, + size_t input_dim) { + T sum = 0; + for (size_t k = 0; k < input_dim; ++k) { + sum += weight_row[k] * input_row[k]; + } + t += sum; + }; + MulByBitCode(op, SimpleCodeTable(num_classes), codes, tmat, weight, input); +} } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/matrix_bit_code.h b/paddle/operators/math/matrix_bit_code.h index bb0599aa177..7bef5077b9b 100644 --- a/paddle/operators/math/matrix_bit_code.h +++ b/paddle/operators/math/matrix_bit_code.h @@ -59,10 +59,27 @@ struct SimpleCodeTable { int max_code_length_; }; +/* For j < codeLength + tmat(i, j) += vec(0, index(i, j)) +*/ template void AddByBitCode(size_t num_classes, const framework::Tensor& codes, - framework::Tensor& a, const framework::Tensor& b); + framework::Tensor& tmat, const framework::Tensor& vec); +/* For j < codeLength + sum(i, 0) = scale_sum * \sum_j bit(i, j) * tmat(i, j) +*/ +template +void SumByBitCode(size_t num_classes, const framework::Tensor& codes, + framework::Tensor& tmat, framework::Tensor& sum, T scale_sum); + +/* For j < codeLength + tmat(i, j) += dot(input.row(i), weight.row(index(i, j))) +*/ +template +void MulByBitCode(size_t num_classes, const framework::Tensor& codes, + framework::Tensor& tmat, const framework::Tensor& weight, + const framework::Tensor& input); } // namespace math } // namespace operators } // namespace paddle -- GitLab