diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md
index 9e6f3b20cbcdc55e481fbe7bf5fa555d8b3c3d45..1a98839a957612b91b2276b58818623ecc62d1d5 100644
--- a/doc/design/selected_rows.md
+++ b/doc/design/selected_rows.md
@@ -1,6 +1,6 @@
 # Design Doc: Selected Rows
 
-`SelectedRows` is a kind of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in that tensor. It is straightforward to represent the sparse tensor by the following sparse tensor data structure:
+`SelectedRows` is a sparse tensor data type designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few of its rows are non-zero. It is straightforward to represent such a sparse tensor with the following data structure:
 
 ```cpp
 class SelectedRows {
@@ -11,7 +11,7 @@ class SelectedRows {
 };
 ```
 
-The field `height_` shows the first dimension of `SelectedRows`. The `rows` are the indices of which rows of `SelectedRows` are non-zeros. The `value_` field is an N-dim tensor and shape is `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+The field `height_` is the first dimension of `SelectedRows`. The `rows_` field holds the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies the values of those rows. The full shape of `SelectedRows` is therefore `[height_] + value_.shape[1:]`.
 
 Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be:
 
@@ -25,7 +25,7 @@ x = SelectedRow {
 
 ## SelectedRows in Protobuf
 
-`SelectedRows` is a kind of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time since the `rows_` and `value_` are related to training data.
+`SelectedRows` is a type of `Variable`, so the `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` is described at compile time, because `rows_` and `value_` depend on the training data.
 So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description.
 
 ```proto
@@ -54,7 +54,7 @@ message VarDesc {
 
 ## InferShape for Selected Rows
 
-Just like `LoD` information, `InferShape` method will inference output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor.
+Just like `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a `Dense` tensor.
 
 For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following
 
 ```cpp
 void TableLookupGrad::InferShape(context) {
@@ -68,7 +68,7 @@ void TableLookupGrad::InferShape(context) {
 
 ## Sparse Operators
 
-There are several operators should be written to support `SelectedRows`. They are:
+There are several operators that need to be written to support `SelectedRows`. These are:
 
-1. Operators which generates `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
+1. Operators that generate a `SelectedRows` gradient, e.g. the gradient of `TableLookupOp`.
 2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensors and `SelectedRows`.
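To make the sparse update described in item 2 concrete, here is a small self-contained sketch. It is not part of this patch and does not use the framework's real `SelectedRows` class; `ToySelectedRows` and `SparseSGDUpdate` are names invented only for the illustration. It applies an SGD step that touches just the rows listed in the sparse gradient:

```cpp
// Toy sketch of a SelectedRows-style sparse SGD update: only the rows listed
// in `rows` of the gradient are read and only those rows of the table change.
#include <cstdint>
#include <iostream>
#include <vector>

struct ToySelectedRows {
  std::vector<int64_t> rows;  // indices of the non-zero rows
  std::vector<float> value;   // rows.size() x width values, row-major
  int64_t height;             // first dimension of the full (dense) tensor
  int64_t width;              // product of the remaining dimensions
};

// `param` is a dense height x width table; only rows named in `grad.rows` are updated.
void SparseSGDUpdate(std::vector<float>* param, const ToySelectedRows& grad,
                     float learning_rate) {
  for (size_t i = 0; i < grad.rows.size(); ++i) {
    const int64_t row = grad.rows[i];
    for (int64_t j = 0; j < grad.width; ++j) {
      (*param)[row * grad.width + j] -=
          learning_rate * grad.value[i * grad.width + j];
    }
  }
}

int main() {
  // A 100 x 2 table; the gradient is non-zero only at rows 73 and 84,
  // matching the example in the design doc above.
  std::vector<float> param(100 * 2, 1.0f);
  ToySelectedRows grad{{73, 84}, {1, 2, 3, 4}, 100, 2};
  SparseSGDUpdate(&param, grad, 0.1f);
  std::cout << param[73 * 2] << " " << param[84 * 2 + 1] << "\n";  // 0.9 0.6
  return 0;
}
```

An actual `SGD` kernel would perform the analogous row-wise update whenever its `Grad` input turns out to be a `SelectedRows` rather than a dense tensor.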
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index c78e05607179387e1c015f6fd24669c538587759..ac80879c541306dba987040b71b0dadd81af2a3c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -281,12 +281,16 @@ static void CreateGradVarInBlock(
   auto ops = block_desc->AllOps();
   for (size_t op_index = grad_op_start_index; op_index < ops.size();
        ++op_index) {
+    bool need_infer_shape = false;
     ForEachVarName(ops[op_index]->Outputs(),
                    [&](const std::string& grad_var_name) {
                      if (block_desc->HasVar(grad_var_name)) {
                        return false;
                      }
-                     block_desc->Var(grad_var_name);
+                     need_infer_shape = true;
+                     auto var = block_desc->Var(grad_var_name);
+                     // FIXME(qiao) infer the datatype
+                     var->SetDataType(framework::DataType::FP32);
                      auto it = param_name_map.find(grad_var_name);
                      if (it == param_name_map.end()) {
                        return false;
                      }
@@ -298,6 +302,9 @@ static void CreateGradVarInBlock(
                      grad_record.op_idx_ = static_cast<int>(op_index);
                      return false; /* not break */
                    });
+    if (need_infer_shape) {
+      ops[op_index]->InferShape(*block_desc);
+    }
   }
 }
 
@@ -428,10 +435,16 @@ ParamGradInfoMap AppendBackward(
   auto& all_ops = root_block->ops_;
 
   // insert fill one op for target
+  // TODO(qiao) add some check to the target.
   std::string fill_one_op_out = GradVarName(target.Name());
+  std::vector<int64_t> target_shape_desc = target.Shape();
+  std::vector<int> target_shape;
+  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
+                 std::back_inserter(target_shape),
+                 [](int64_t dim) { return static_cast<int>(dim); });
   std::unique_ptr<OpDescBind> fill_one_op(
       new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", std::vector<int>{1}},
+                     {{"shape", target_shape},
                       {"value", static_cast<float>(1.0)},
                       {"data_type", framework::DataType::FP32}}));
   all_ops.push_back(std::move(fill_one_op));
@@ -443,13 +456,22 @@ ParamGradInfoMap AppendBackward(
   auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
                                              &no_grad_var_names, &grad_to_var);
 
-  std::unordered_map<std::string, GradVarInfo> retv;
-
-  // Create Variable
   for (auto& ptr : backward_op_descs) {
     all_ops.push_back(std::move(ptr));
   }
-  root_block->Var(fill_one_op_out);
+  // Create Variable
+
+  // Create target gradient variable
+  std::unordered_map<std::string, GradVarInfo> retv;
+
+  auto var = root_block->Var(fill_one_op_out);
+  // FIXME(qiao) infer the data type
+  var->SetDataType(framework::DataType::FP32);
+  var->SetShape(target.Shape());
+  auto& target_grad = retv[target.Name()];
+  target_grad.name_ = fill_one_op_out;
+  target_grad.block_idx_ = root_block_idx;
+  target_grad.op_idx_ = static_cast<int>(forward_op_num);
 
   // create grad_var for all blocks in this program
   CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 5302afcafb5c0e1c057302dac174be935649ef11..0c35a157bcfeb93193536fa026244e39e948cc2f 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -26,6 +26,20 @@ namespace framework {
 
 using DeviceContext = platform::DeviceContext;
 
+class NoneOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+template <typename Place, typename T>
+class NoneKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override {} +}; + class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { public: RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -215,19 +229,51 @@ class MinusOpMaker : public OpProtoAndCheckerMaker { namespace f = paddle::framework; namespace ops = paddle::operators; using EnforceNotMet = paddle::platform::EnforceNotMet; -REGISTER_OPERATOR(rowwise_add, f::NOP, f::RowWiseAddOpMaker, +// rowwise_add +REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker, f::RowWiseAddGradMaker); -REGISTER_OPERATOR(rowwise_add_grad, f::NOP); -REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP); -REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP); -REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker); -REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP); +REGISTER_OP_CPU_KERNEL(rowwise_add, + f::NoneKernel); +REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(rowwise_add_grad, + f::NoneKernel); +// mul +REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mul_grad, + f::NoneKernel); +// sigmoid +REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sigmoid, + f::NoneKernel); +REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker); +// fill_zeros_like +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker); +REGISTER_OP_CPU_KERNEL(fill_zeros_like, + f::NoneKernel); +// sum +REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp); +REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel); +REGISTER_OP_CPU_KERNEL(sum_grad, + f::NoneKernel); +// fc REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); -REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad, - f::NOP); -REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP); -REGISTER_OPERATOR(minus, f::NOP, f::MinusOpMaker, f::MinusGradOpDescMaker); +// many_output_op +REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker, + many_output_op_grad, f::NoneOp); +// mult_in_out +REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad, + f::NoneOp); +REGISTER_OP_CPU_KERNEL(mult_in_out, + f::NoneKernel); +REGISTER_OP_CPU_KERNEL(mult_in_out_grad, + f::NoneKernel); +// minus +REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker); +REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel); +// scale +REGISTER_OPERATOR(scale, f::NoneOp); +REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); TEST(Backward, simple_op_not_need_grad) { auto fwd = f::OpRegistry::CreateOp( @@ -463,6 +509,7 @@ TEST(Backward, simple_single_op) { f::ProgramDesc *program_desc = GetNewProgramDesc(); f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc); f::BlockDescBind *block = program.Block(0); + f::OpDescBind *op = block->AppendOp(); op->SetType("rowwise_add"); op->SetInput("X", {"x"}); @@ -487,7 +534,7 @@ TEST(Backward, simple_single_op) { EXPECT_EQ(grad_op->Output(f::GradVarName("b")), std::vector({f::GradVarName("b")})); - EXPECT_EQ(var_to_grad.size(), 2UL); + EXPECT_EQ(var_to_grad.size(), 3UL); EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2)); EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2)); @@ -588,7 +635,7 @@ TEST(Backward, 
simple_mult_op) { EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), std::vector({f::GradVarName("b3")})); - EXPECT_EQ(var_to_grad.size(), 6UL); + EXPECT_EQ(var_to_grad.size(), 7UL); EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); EXPECT_EQ(var_to_grad.at("out1"), @@ -666,7 +713,7 @@ TEST(Backward, intermedia_var_no_grad) { std::vector({f::GradVarName("out1")})); EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); - EXPECT_EQ(var_to_grad.size(), 3UL); + EXPECT_EQ(var_to_grad.size(), 4UL); EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); EXPECT_EQ(var_to_grad.at("out1"), @@ -744,7 +791,7 @@ TEST(Backward, var_no_grad) { EXPECT_EQ(grad_op1->Output(f::GradVarName("H")), std::vector({f::GradVarName("h1")})); - EXPECT_EQ(var_to_grad.size(), 3UL); + EXPECT_EQ(var_to_grad.size(), 4UL); EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3)); EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5)); EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5)); @@ -830,7 +877,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - EXPECT_EQ(var_to_grad.size(), 5UL); + EXPECT_EQ(var_to_grad.size(), 6UL); EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); EXPECT_EQ(var_to_grad.at("out1"), @@ -863,7 +910,7 @@ TEST(Backward, half_backward) { auto ops = block->AllOps(); ASSERT_EQ(3UL, ops.size()); - EXPECT_EQ(var_to_grad.size(), 1UL); + EXPECT_EQ(var_to_grad.size(), 2UL); EXPECT_EQ(var_to_grad.at("a"), f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1)); } diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 47b75228cdbd2a8b4f0c5ad33aa82f5e43044606..ba970254e590308851e31542e2037e9c5590c7fa 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -72,13 +72,13 @@ std::vector BlockDescBind::AllOps() const { void BlockDescBind::Flush() { if (need_update_) { auto &op_field = *this->desc_->mutable_ops(); - op_field.Clear(); + this->ClearPBOps(); op_field.Reserve(static_cast(ops_.size())); for (auto &op_desc : ops_) { op_field.AddAllocated(op_desc->Proto()); } auto &var_field = *this->desc_->mutable_vars(); - var_field.Clear(); + this->ClearPBVars(); var_field.Reserve(static_cast(vars_.size())); for (auto &var_desc : vars_) { var_field.AddAllocated(var_desc.second->Proto()); @@ -99,5 +99,21 @@ BlockDesc *BlockDescBind::Proto() { return desc_; } +void BlockDescBind::ClearPBOps() { + auto ops = this->desc_->mutable_ops(); + while (!ops->empty()) { + // we do not own the OpDesc, so release the ownership. + ops->ReleaseLast(); + } +} + +void BlockDescBind::ClearPBVars() { + auto vars = this->desc_->mutable_vars(); + while (!vars->empty()) { + // we do not own the VarDesc, so release the ownership. 
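+    // (The protos are owned by the corresponding VarDescBind objects held by this
+    // block, so they must be released here rather than deleted.)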
+ vars->ReleaseLast(); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 9fb88f963283c72e1ec389b72dd2d98049c74f6d..dd7b1228befd6b4f19c9aa3dd493b0d8005359fc 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -36,6 +36,11 @@ class BlockDescBind { BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) {} + ~BlockDescBind() { + this->ClearPBVars(); + this->ClearPBOps(); + } + int32_t ID() const { return desc_->idx(); } int32_t Parent() const { return desc_->parent_idx(); } @@ -60,6 +65,10 @@ class BlockDescBind { BlockDesc *Proto(); + private: + void ClearPBOps(); + void ClearPBVars(); + // FIXME(yuyang18): backward will access private data of BlockDesc. // Mark it public temporary. We can fix it later. public: diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 8e82e28bac478ad93ece3fcec9730c6cbabc392a..b3b85b586579c1e8129a40e44141ef4011e356e1 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -64,99 +64,23 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { auto& block = pdesc.blocks(block_id); auto& device = device_contexts_[0]; - // Instantiate all the vars in the global scope - for (auto& var : block.vars()) { - scope->Var(var.name()); - } - Scope& local_scope = scope->NewScope(); - std::vector should_run = Prune(pdesc, block_id); - PADDLE_ENFORCE_EQ(should_run.size(), static_cast(block.ops_size())); - for (size_t i = 0; i < should_run.size(); ++i) { - if (should_run[i]) { - for (auto& var : block.ops(i).outputs()) { - for (auto& argu : var.arguments()) { - if (local_scope.FindVar(argu) == nullptr) { - local_scope.Var(argu); - } - } - } - auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - op->Run(local_scope, *device); + for (auto& var : block.vars()) { + if (var.persistable()) { + scope->Var(var.name()); + } else { + local_scope.Var(var.name()); } } - // TODO(tonyyang-svail): - // - Destroy local_scope -} - -std::vector Prune(const ProgramDesc& pdesc, int block_id) { - // TODO(tonyyang-svail): - // - will change to use multiple blocks for RNN op and Cond Op - - auto& block = pdesc.blocks(block_id); - auto& ops = block.ops(); - - bool expect_feed = true; - for (auto& op_desc : ops) { - PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, - "All FeedOps are at the beginning of the ProgramDesc"); - expect_feed = (op_desc.type() == kFeedOpType); - } - - bool expect_fetch = true; - for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { - auto& op_desc = *op_iter; - PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch, - "All FetchOps must at the end of the ProgramDesc"); - expect_fetch = (op_desc.type() == kFetchOpType); - } - - std::set dependent_vars; - std::vector should_run; - for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { - auto& op_desc = *op_iter; - - bool found_dependent_vars = false; - for (auto& var : op_desc.outputs()) { - for (auto& argu : var.arguments()) { - if (dependent_vars.count(argu) != 0) { - found_dependent_vars = true; - } - } - } - - if (op_desc.type() == kFetchOpType || found_dependent_vars) { - // erase its output to the dependency graph - for (auto& var : op_desc.outputs()) { - for (auto& argu : var.arguments()) { - dependent_vars.erase(argu); - } - } - - // insert its input to the dependency graph - for (auto& var : op_desc.inputs()) { - for (auto& argu : 
var.arguments()) { - dependent_vars.insert(argu); - } - } - - should_run.push_back(true); - } else { - should_run.push_back(false); - } + for (auto& op_desc : block.ops()) { + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(local_scope, *device); } // TODO(tonyyang-svail): - // - check this after integration of Init - // PADDLE_ENFORCE(dependent_vars.empty()); - - // since we are traversing the ProgramDesc in reverse order - // we reverse the should_run vector - std::reverse(should_run.begin(), should_run.end()); - - return should_run; + // - Destroy local_scope } } // namespace framework diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 4e3bc2c0a59dfee5b9993037671f14a109dc8a74..793ee954e25f7da6c9d04ea6acc2ad78812e8329 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -40,16 +40,5 @@ class Executor { std::vector device_contexts_; }; -/* @Brief - * Pruning the graph - * - * @param - * ProgramDesc - * - * @return - * vector Same size as ops. Indicates whether an op should be run. - */ -std::vector Prune(const ProgramDesc& pdesc, int block_id); - } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 0bda87dfa193b58798da6addf485f870bd0d7e83..dfca46b789f3ee129d7a6e19398c808448e8afa0 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -45,18 +45,15 @@ class Registrar { template struct OperatorRegistrar : public Registrar { - explicit OperatorRegistrar(const char* op_type) : op_type(op_type) { + explicit OperatorRegistrar(const char* op_type) { PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), "'%s' is registered more than once.", op_type); static_assert(sizeof...(ARGS) != 0, "OperatorRegistrar should be invoked at least by OpClass"); + OpInfo info; details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info); OpInfoMap::Instance().Insert(op_type, info); } - - const char* op_type; - - OpInfo info; }; class OpRegistry { diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 16971883b42786e2cb48faafd2a7e95d45075ac8..6888380290074318fe7f94d168b2931e776dda47 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -135,7 +135,7 @@ public: const std::string& getName() const { return subModelName_; } /// some finish work, like convert the weight format of MKLDNNLayers - void finish() override; + void finish(); protected: /** diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 77a1e22b41e8dcd1fe78f3c4730653dee04db80e..aad1357598c629a4edfe0ad9b23f0241093a2522 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -130,6 +130,87 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +#ifdef PADDLE_USE_MKLML +// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} + +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, + a_array.data(), &lda, b_array.data(), &ldb, &beta, + c_array.data(), &ldc, 1 /* group_count */, &batchCount); +} +#else +// The below is a naive but correct serial implementation that just loops +// over the batch dimension. This is a fallback for when the batched gemm +// functions of Intel MKL are not available. In the future, this computation +// should be parallelized. +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const float* Ak = &A[k * strideA]; + const float* Bk = &B[k * strideB]; + float* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, alpha, Ak, + Bk, beta, Ck); + } +} + +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + for (int k = 0; k < batchCount; ++k) { + const double* Ak = &A[k * strideA]; + const double* Bk = &B[k * strideB]; + double* Ck = &C[k * M * N]; + gemm(context, transA, transB, M, N, K, alpha, + Ak, Bk, beta, Ck); + } +} +#endif + template struct SetConstant; } // namespace math diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 7fbc03acf22231a6fa386aa67e43f738eadb18d3..5583683c6e12b88ba81015aef9161913de261ef2 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -155,6 +155,54 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas 
convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( + reinterpret_cast(context) + .cublas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, + &beta, C, ldc, strideC, batchCount)); +} + +template <> +void batched_gemm( + const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( + reinterpret_cast(context) + .cublas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, + &beta, C, ldc, strideC, batchCount)); +} + template struct SetConstant; } // namespace math diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 6f92d83aabbc77f7ea7d4159869e07126b270740..9777ebfd156709a370be2cb4ba0077ac7c6735fb 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -63,7 +63,7 @@ namespace math { // Support continuous memory now // If transA = N, and transB = N -// Then matrixA: M * K, matrixB: K * N matrixC : M * N +// Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html template @@ -85,6 +85,14 @@ void matmul(const platform::DeviceContext& context, const framework::Tensor& matrix_b, bool trans_b, T alpha, framework::Tensor* matrix_out, T beta); +// Batched gemm +template +void batched_gemm(const platform::DeviceContext& context, + const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, + const int M, const int N, const int K, const T alpha, + const T* A, const T* B, const T beta, T* C, + const int batchCount, const int strideA, const int strideB); + template struct SetConstant { void operator()(const platform::DeviceContext& context, diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..6ba9a0ba9a70bd938f9362179990ab68fa3186ba --- /dev/null +++ b/paddle/operators/math/matmul.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +// Implements the logic of numpy matmul: +// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html +// +// but allowing also for a, b to be transposed +// +// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported +// yet. +template +class MatMulFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& a, bool trans_a, + const framework::Tensor& b, bool trans_b, T alpha, + framework::Tensor* out, T beta) { + auto dim_a = a.dims(); + auto dim_b = b.dims(); + + PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(), + "Tensors must all be in the same place."); + PADDLE_ENFORCE_GE(dim_a.size(), 1, + "Input tensor a must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_b.size(), 1, + "Input tensor b must be at least 1-dimensional."); + PADDLE_ENFORCE_LE(dim_a.size(), 3, + "Input tensor a must be at most 3-dimensional."); + PADDLE_ENFORCE_LE(dim_b.size(), 3, + "Input tensor b must be at most 3-dimensional."); + + int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0, + strideA = 0, strideB = 0; + + switch (dim_a.size()) { + case 1: + // similar to np.matmul: + // prepend dimension 1 (no transpose) or append dimension 1 (transpose) + M = trans_a ? dim_a[0] : 1; + kA = trans_a ? 1 : dim_a[0]; + break; + case 2: + M = trans_a ? dim_a[1] : dim_a[0]; + kA = trans_a ? dim_a[0] : dim_a[1]; + break; + case 3: + batchCountA = dim_a[0]; + M = trans_a ? dim_a[2] : dim_a[1]; + kA = trans_a ? dim_a[1] : dim_a[2]; + strideA = M * kA; + break; + default: + assert(false); + } + + switch (dim_b.size()) { + case 1: + // similar to np.matmul: + // append dimension 1 (no transpose) or prepend dimension 1 (transpose) + kB = trans_b ? 1 : dim_b[0]; + N = trans_b ? dim_b[0] : 1; + break; + case 2: + kB = trans_b ? dim_b[1] : dim_b[0]; + N = trans_b ? dim_b[0] : dim_b[1]; + break; + case 3: + batchCountB = dim_b[0]; + kB = trans_b ? dim_b[2] : dim_b[1]; + N = trans_b ? dim_b[1] : dim_b[2]; + strideB = kB * N; + break; + default: + assert(false); + } + + PADDLE_ENFORCE_EQ( + kA, kB, + "First matrix's width must be equal with second matrix's height."); + if (batchCountA && batchCountB) { + PADDLE_ENFORCE_EQ( + batchCountA, batchCountB, + "When input tensors a and b are both batched, they must have the " + "same batch dimension."); + } + int batchCount = std::max(batchCountA, batchCountB); + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + if (!batchCount) { + // regular matrix multiplication + gemm(context, transA, transB, M, N, kA, alpha, a.data(), + b.data(), beta, out->data()); + } else { + // batched matrix multiplication + batched_gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data(), + batchCount, strideA, strideB); + } + } +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ecbee3b413617e3a5523d9a32e72bc08bd316c5 --- /dev/null +++ b/paddle/operators/matmul_op.cc @@ -0,0 +1,208 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/matmul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MatMulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of MatMulOp should not be null."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of MatMulOp should not be null."); + + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + bool transpose_x = context->Attrs().Get("transpose_X"); + bool transpose_y = context->Attrs().Get("transpose_Y"); + + PADDLE_ENFORCE_GE(dim_x.size(), 1, + "Input tensor X must be at least 1-dimensional."); + PADDLE_ENFORCE_GE(dim_y.size(), 1, + "Input tensor Y must be at least 1-dimensional."); + PADDLE_ENFORCE_LE(dim_x.size(), 3, + "Input tensor X must be at most 3-dimensional."); + PADDLE_ENFORCE_LE(dim_y.size(), 3, + "Input tensor Y must be at most 3-dimensional."); + + int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0; + bool remove_initial_dim = false, remove_final_dim = false; + + switch (dim_x.size()) { + case 1: + if (transpose_x) { + M = dim_x[0]; + KX = 1; + } else { + M = 1; + KX = dim_x[0]; + remove_initial_dim = true; + } + break; + case 2: + M = transpose_x ? dim_x[1] : dim_x[0]; + KX = transpose_x ? dim_x[0] : dim_x[1]; + break; + case 3: + batchCountX = dim_x[0]; + M = transpose_x ? dim_x[2] : dim_x[1]; + KX = transpose_x ? dim_x[1] : dim_x[2]; + break; + default: + assert(false); + } + + switch (dim_y.size()) { + case 1: + if (transpose_y) { + N = dim_y[0]; + KY = 1; + } else { + N = 1; + KY = dim_y[0]; + remove_final_dim = true; + } + break; + case 2: + KY = transpose_y ? dim_y[1] : dim_y[0]; + N = transpose_y ? dim_y[0] : dim_y[1]; + break; + case 3: + batchCountY = dim_y[0]; + KY = transpose_y ? dim_y[2] : dim_y[1]; + N = transpose_y ? dim_y[1] : dim_y[2]; + break; + default: + assert(false); + } + + PADDLE_ENFORCE_EQ( + KX, KY, + "First matrix's width must be equal with second matrix's height."); + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + + std::vector dim_out; + if (batchCount) { + dim_out.push_back(batchCount); + } + if (!remove_initial_dim) { + dim_out.push_back(M); + } + if (!remove_final_dim) { + dim_out.push_back(N); + } + if (dim_out.size() == 0) { + // We don't support 0-dimensional Tensors (scalars), so instead + // treat the output as a Tensor of shape (1, ) in this case. 
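+      // (The Python reference implementation in test_matmul_op.py makes the same
+      // scalar-to-shape-(1,) conversion.)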
+ dim_out.push_back(1); + } + context->SetOutputDim("Out", framework::make_ddim(dim_out)); + context->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of MatMul op"); + AddInput("Y", "The second input of MatMul op"); + AddOutput("Out", "The output of MatMul op"); + AddAttr("transpose_X", + R"DOC(If true, use the transpose of `X`. + )DOC") + .SetDefault(false); + AddAttr("transpose_Y", + R"DOC(If true, use the transpose of `Y`. + )DOC") + .SetDefault(false); + AddComment(R"DOC( +The MatMul operator is used to perform (batched) matrix multiplication +over the last two dimensions of the input tensors `X` and `Y`. + +If a transpose flag is specified, the last two dimensions of the +tensor are transposed. If the tensor is rank-1 of shape [D], then +for `X` it is treated as [1, D] in nontransposed form and as [D, 1] +in transposed form, whereas for `Y` it is the opposite: It is treated +as [D, 1] in nontransposed form and as [1, D] in transposed form. + +Examples without transpose: +- X: [K], Y: [K] => Out: [1] +- X: [K], Y: [K, N] => Out: [N] +- X: [B, M, K], Y: [K] => Out: [B, M] +- X: [M, K], Y: [B, K, N] => Out: [B, M, N] +- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] + +The behavior is designed to be similar to the `numpy.matmul` function. +The differences are: +- Currently only rank 1 to rank 3 input tensors are supported. +- We add `transpose_X` and `transpose_Y` flags. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. +)DOC"); + } +}; + +class MatMulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = context->GetInputDim("X"); + auto y_dims = context->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(y_grad_name)) { + context->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, + ops::MatMulOpGrad); +REGISTER_OP_CPU_KERNEL(matmul, + ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.cu b/paddle/operators/matmul_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7e66382f00445b087e14103e7a148d450b37405 --- /dev/null +++ b/paddle/operators/matmul_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/matmul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(matmul, + ops::MatMulKernel); +REGISTER_OP_GPU_KERNEL( + matmul_grad, ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8ae54e1eec33c4bce563f697bafbdc68f97ab746 --- /dev/null +++ b/paddle/operators/matmul_op.h @@ -0,0 +1,228 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/matmul.h" +#include "paddle/operators/transpose_op.h" + +namespace paddle { +namespace operators { +namespace matmul_detail { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +using framework::make_ddim; +using framework::vectorize; + +template +class MatMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + Tensor* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + math::MatMulFunctor()(context.device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); + } +}; + +template +inline Tensor Reshape(const Tensor& input, const DDim& dims) { + Tensor output; + output.ShareDataWith(input); + output.Resize(dims); + return output; +} + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +template +Tensor CombineBatchAndM(const Tensor& input) { + Tensor output; + output.ShareDataWith(input); + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + std::vector out_dims = {in_dims[0] * in_dims[1], in_dims[2]}; + output.Resize(make_ddim(out_dims)); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. 
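+// CombineBatchAndM / CombineBatchAndN are used by MatMulGradKernel below to fold
+// the batch dimension into a single 2-D GEMM when only one operand is 3-D.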
+template +Tensor CombineBatchAndN(const framework::ExecutionContext& context, + const Tensor& input) { + Tensor output; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize(in_dims); + output.mutable_data(context.GetPlace()); + EigenTranspose(context, input, output, {1, 0, 2}); + std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; + output.Resize(make_ddim(out_dims)); + } else { + output.ShareDataWith(input); + } + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. +template +class MatMulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + const Tensor& dout = *context.Input(framework::GradVarName("Out")); + Tensor* dx = context.Output(framework::GradVarName("X")); + Tensor* dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + // If X is a vector, reshape it to a matrix. + if (x_dims.size() == 1) { + x_dims.insert(x_dims.begin(), 1); + } + + // If Y is a vector, reshape it to a matrix. + if (y_dims.size() == 1) { + y_dims.push_back(1); + } + + // Fix the dOut dimensions. + int M = 0, N = 0, batchCountX = 0, batchCountY = 0; + + switch (x_dims.size()) { + case 2: + M = transpose_x ? x_dims[1] : x_dims[0]; + break; + case 3: + batchCountX = x_dims[0]; + M = transpose_x ? x_dims[2] : x_dims[1]; + break; + default: + assert(false); + } + + switch (y_dims.size()) { + case 2: + N = transpose_y ? y_dims[0] : y_dims[1]; + break; + case 3: + batchCountY = y_dims[0]; + N = transpose_y ? y_dims[1] : y_dims[2]; + break; + default: + assert(false); + } + if (batchCountX && batchCountY) { + PADDLE_ENFORCE_EQ( + batchCountX, batchCountY, + "When Input(X) and Input(Y) are both three dimensional, they " + "must have the same batch dimension."); + } + int batchCount = std::max(batchCountX, batchCountY); + std::vector dout_dims = {M, N}; + if (batchCount) { + dout_dims.insert(dout_dims.begin(), batchCount); + } + Tensor X = Reshape(x, make_ddim(x_dims)); + Tensor Y = Reshape(y, make_ddim(y_dims)); + Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + + if (dx) { + dx->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dX = + (x_dims.size() == 2 && y_dims.size() == 3) + ? 
CombineBatchAndN(context, dOut) + : dOut; + if (x_dims.size() == 2 && y_dims.size() == 3) { + Y = transpose_y ? CombineBatchAndM(Y) + : CombineBatchAndN(context, Y); + } + if (transpose_x) { + math::MatMulFunctor()(context.device_context(), Y, + transpose_y, dOut_for_dX, transpose_x, + T(1), dx, T(0)); + } else { + math::MatMulFunctor()(context.device_context(), dOut_for_dX, + transpose_x, Y, !transpose_y, T(1), dx, + T(0)); + } + } + + if (dy) { + dy->mutable_data(context.GetPlace()); + const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3) + ? CombineBatchAndM(dOut) + : dOut; + if (y_dims.size() == 2 && x_dims.size() == 3) { + X = transpose_x ? CombineBatchAndN(context, X) + : CombineBatchAndM(X); + dOut = CombineBatchAndM(dOut); + } + if (transpose_y) { + math::MatMulFunctor()(context.device_context(), dOut_for_dY, + transpose_y, X, transpose_x, T(1), dy, + T(0)); + } else { + math::MatMulFunctor()(context.device_context(), X, + !transpose_x, dOut_for_dY, transpose_y, + T(1), dy, T(0)); + } + } + } +}; +} // namespace matmul_detail + +using matmul_detail::MatMulKernel; +using matmul_detail::MatMulGradKernel; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9be4d15a43d87ae1a27c81498e8b19b0049a3bfa --- /dev/null +++ b/paddle/operators/momentum_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/momentum_op.h" + +namespace paddle { +namespace operators { + +class MomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(param) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(grad) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Velocity"), + "Input(velocity) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of Momentum should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), + "Output(VelocityOut) of Momentum should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, + "Learning_rate should be a scalar"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + } +}; + +class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MomentumOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + + AddOutput("ParamOut", "(Tensor) Output updated parameter"); + AddOutput("VelocityOut", "(Tensor) Output updated velocity"); + + AddAttr("mu", "(float) Momentum coefficient"); + AddComment(R"DOC( + +Momentum Algorithm (momentum). + +velocity = mu * velocity + gradient +param = param - learning_rate * velocity + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); +REGISTER_OP_CPU_KERNEL( + momentum, ops::MomentumOpKernel); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..efc24e795e05951024009f0b3258769c352df344 --- /dev/null +++ b/paddle/operators/momentum_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#define EIGEN_USE_GPU +#include "paddle/operators/momentum_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + momentum, ops::MomentumOpKernel); diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f7a724f048782ceee8509ddafcb4834fd8dbba8a --- /dev/null +++ b/paddle/operators/momentum_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class MomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + float mu = ctx.Attr("mu"); + + auto p_out = framework::EigenVector::Flatten(*param_out); + auto v_out = framework::EigenVector::Flatten(*velocity_out); + + auto p = framework::EigenVector::Flatten(*param); + auto v = framework::EigenVector::Flatten(*velocity); + auto g = framework::EigenVector::Flatten(*grad); + auto lr = framework::EigenVector::Flatten(*learning_rate); + + auto place = ctx.GetEigenDevice(); + + Eigen::DSizes grad_dsize(grad->numel()); + v_out.device(place) = v * mu + g; + p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 943f81e94933bedd249086ef51ce2d0510c66a1c..065800f250d8b35a626060bac271e1bce6bb784b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -104,10 +104,10 @@ class MulOpGrad : public framework::OperatorWithKernel { auto y_dims = ctx->GetInputDim("Y"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_mat_dims = - framework::flatten_to_2d(x_dims, Attr("x_num_col_dims")); - auto y_mat_dims = - framework::flatten_to_2d(y_dims, Attr("y_num_col_dims")); + auto x_mat_dims = framework::flatten_to_2d( + x_dims, ctx->Attrs().Get("x_num_col_dims")); + auto y_mat_dims = framework::flatten_to_2d( + y_dims, ctx->Attrs().Get("y_num_col_dims")); PADDLE_ENFORCE_EQ( x_mat_dims[0], out_dims[0], diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 9d8343c0b5e200b390ccda760f09816959952e9d..6b64539b0a9a4d535a53447fbcc0e458f3ac9129 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -77,6 +77,10 @@ extern void *cublas_dso_handle; __macro(cublasDgemmBatched); \ __macro(cublasCgemmBatched); \ __macro(cublasZgemmBatched); \ + __macro(cublasSgemmStridedBatched); \ + 
__macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ __macro(cublasSgetrfBatched); \ __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index b360b05d16c9a1c135fa56cb37919dece8f16788..82aae72ba9378d6f05a9d4c0b95c7afd40650026 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -163,6 +163,11 @@ void BindBlockDesc(py::module &m) { return self.Var(name); }, py::return_value_policy::reference) + .def("has_var", + [](BlockDescBind &self, py::bytes byte_name) { + std::string name = byte_name; + return self.HasVar(name); + }) .def("find_var", [](BlockDescBind &self, py::bytes byte_name) { std::string name = byte_name; diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index a17f988bf433078bfb4c06dab57896e2648953ce..3fb6efe42a2c69e37274dbb655fa47d78eabdf26 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -306,6 +306,14 @@ class Block(object): def idx(self): return self.desc.id + def var(self, name): + if name not in self.vars: + raise ValueError("var %s not in this block" % name) + return self.vars[name] + + def all_parameters(self): + return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)} + def create_var(self, *args, **kwargs): return Variable(self, *args, **kwargs) @@ -314,7 +322,8 @@ class Block(object): def create_parameter(self, *args, **kwargs): global_block = self.program.global_block() - return Parameter(global_block, *args, **kwargs) + param = Parameter(global_block, *args, **kwargs) + return param def append_op(self, *args, **kwargs): op_desc = self.desc.append_op() @@ -392,10 +401,16 @@ class Program(object): def global_block(self): return self.blocks[0] + def block(self, index): + return self.blocks[index] + def current_block(self): return self.blocks[self.current_block_idx] def append_backward(self, target, no_grad_set): + """ + return map(param_name -> (grad_name, block_index, op_index)) + """ assert isinstance(target, Variable) param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set) self.sync_with_cpp() diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e356a7aadb8d6a87d0fe54a5dd2a11fea0d80a74 --- /dev/null +++ b/python/paddle/v2/framework/optimizer.py @@ -0,0 +1,124 @@ +import paddle.v2.framework.framework as framework + +__all__ = ['SGDOptimizer'] + + +class Optimizer(object): + """Optimizer Base class. + + Define the common interface of an optimizer. + User should not use this class directly, but need to use one of it's implementation. + """ + + def __init__(self): + pass + + def _append_optimize_op(self, block, param_and_grad): + """ append optimize operator to block and return all the added optimize_op + """ + raise NotImplementedError() + + def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None): + """ + create and add gradient Operators in BlockDesc to Compute gradients of `loss` + for parameters in parameter_list + + Args: + loss: an variable generated by cost function. + no_grad_set: variable that should not create gradient + parameter_list: parameters that need to compute gradient and update to optimize the lost. + + Returns: + list of (parameters, gradients) pair. 
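+
+        A rough usage sketch (assuming `loss` is a Variable produced by a cost
+        function; the learning-rate value is only an example):
+
+            optimizer = SGDOptimizer(learning_rate=0.01)
+            params_grads = optimizer.create_backward_pass(loss)
+            optimizer.create_optimization_pass(params_grads, loss)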
+ """ + assert isinstance(loss, framework.Variable) + param_grad_map = loss.block.program.append_backward(loss, no_grad_set or + set()) + if parameter_list is not None: + parameters = parameter_list + else: + params = loss.block.program.global_block().all_parameters() + parameters = [param.name for param in params] + params_and_grads = [] + for param in parameters: + if param not in param_grad_map: + raise Exception("param %s is not in map" % param) + grad_info = param_grad_map[param] + grad_block = loss.block.program.block(grad_info[1]) + if not grad_block.has_var(grad_info[0]): + raise Exception("grad block[%d] did not have grad var %s" % + grad_info[1], grad_info[0]) + param_var = loss.block.var(param) + grad_var = grad_block.var(grad_info[0]) + if loss.block.has_var(grad_info[0]): + params_and_grads.append((param_var, grad_var)) + else: + params_and_grads.append((param_var, None)) + return params_and_grads + + def create_optimization_pass(self, parameters_and_grads, loss): + """Add optimization operators to update gradients to variables. + + Args: + loss: the target that this optimization is for. + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + optimize_ops = [] + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is not None: + optimize_op = self._append_optimize_op(loss.block, + param_and_grad) + optimize_ops.append(optimize_op) + return optimize_ops + + def minimize(self, loss, parameter_list=None, no_grad_set=None): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `create_backward_pass()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list, + no_grad_set or set()) + optimize_ops = self.create_optimization_pass(params_grads, loss) + return optimize_ops + + +class SGDOptimizer(Optimizer): + """ Simple SGD optimizer without any state. 
+ """ + + def __init__(self, learning_rate): + assert learning_rate is not None + super(Optimizer, self).__init__() + self.type = "sgd" + self._learning_rate = learning_rate + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + lr_shape = [1] + # create a var for learning_rate + lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0) + + # create an op to init the learning_rate + init_op = block.append_op( + type="fill_constant", + outputs={"Out": lr}, + attrs={"shape": lr_shape, + "value": self._learning_rate}) + + # create the optimize op + sgd_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr + }, + outputs={"ParamOut": param_and_grad[0]}, + attrs={"shape": [1], + "value": self._learning_rate}) + + return sgd_op diff --git a/python/paddle/v2/framework/tests/test_matmul_op.py b/python/paddle/v2/framework/tests/test_matmul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..d51572c8ab7c44fa0c6e83e50b56f05780530c61 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_matmul_op.py @@ -0,0 +1,119 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): + BATCH_SIZE = 2 + M = 3 + N = 4 + K = 5 + if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): + K = 1 + if dim_X == 1: + if transpose_X: + shape_X = [M] + else: + shape_X = [K] + if dim_Y == 1: + if transpose_Y: + shape_Y = [N] + else: + shape_Y = [K] + if dim_X >= 2: + if transpose_X: + shape_X = [K, M] + else: + shape_X = [M, K] + if dim_X == 3: + shape_X = [BATCH_SIZE] + shape_X + if dim_Y >= 2: + if transpose_Y: + shape_Y = [N, K] + else: + shape_Y = [K, N] + if dim_Y == 3: + shape_Y = [BATCH_SIZE] + shape_Y + return shape_X, shape_Y + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, 1)) + elif X.ndim == 2: + X = X.T + elif X.ndim == 3: + X = np.transpose(X, (0, 2, 1)) + else: + raise ValueError('X must have between 1 and 3 dimensions') + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((1, Y.size)) + elif Y.ndim == 2: + Y = Y.T + elif Y.ndim == 3: + Y = np.transpose(Y, (0, 2, 1)) + else: + raise ValueError('Y must have between 1 and 3 dimensions') + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float32") + return Out + + +class Generator(object): + def setUp(self): + self.op_type = "matmul" + X = np.random.random(self.shape_X).astype("float32") + Y = np.random.random(self.shape_Y).astype("float32") + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y + } + self.outputs = {'Out': Out} + + def test_check_output(self): + self.check_output(atol=1e-2) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +# Generate test cases for all possibilities +for dim_X in [1, 2, 3]: + for dim_Y in [1, 2, 3]: + for transpose_X in [False, True]: + for transpose_Y in [False, True]: + test_name = ( + 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim_X, dim_Y, transpose_X, transpose_Y)) + shape_X, shape_Y = generate_compatible_shapes( + dim_X, dim_Y, transpose_X, transpose_Y) + test_class = type(test_name, (Generator, OpTest), { + 'shape_X': shape_X, + 'shape_Y': shape_Y, + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + }) + globals()[test_name] = test_class + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/framework/tests/test_momentum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..d3353ff6e4f4da32eaefdd4e816a621ddac8bece --- /dev/null +++ b/python/paddle/v2/framework/tests/test_momentum_op.py @@ -0,0 +1,35 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestMomentumOp(OpTest): + def setUp(self): + self.op_type = "momentum" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + mu = 0.0001 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate + } + + self.attrs = {'mu': mu} + + velocity_out = mu * velocity + grad + param_out = param - learning_rate * velocity_out + + self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3d6fa70737bf360df53785dc602feceda471ee70 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_optimizer.py @@ -0,0 +1,31 @@ +import unittest + +import paddle.v2.framework.framework as framework +import paddle.v2.framework.optimizer as optimizer + + +class TestOptimizer(unittest.TestCase): + def test_sgd_optimizer(self): + program = framework.g_program + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mul_op = block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + 
outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + opts = sgd_optimizer.minimize(mul_out) + self.assertEqual(len(opts), 1) + sgd_op = opts[0] + self.assertEqual(sgd_op.type, "sgd") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py index d06f86c09fe4edf8364e7d124cb7b8b1ae6bcc64..c98dc3492b9506f5a1639d26e3f0f1c2c9ad75d7 100644 --- a/python/paddle/v2/framework/tests/test_program.py +++ b/python/paddle/v2/framework/tests/test_program.py @@ -34,49 +34,11 @@ class TestProgram(unittest.TestCase): self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - def test_desc_append_backward(self): - prog = core.ProgramDesc.__create_program_desc__() - self.assertIsNotNone(prog) - block = prog.block(0) - self.assertIsNotNone(block) - - mul_op_desc = block.append_op() - mul_op_desc.set_type("mul") - mul_op_desc.set_input("X", ["x1"]) - mul_op_desc.set_input("Y", ["y1"]) - mul_op_desc.set_output("Out", ["out1"]) - - sum_op_desc = block.append_op() - sum_op_desc.set_type("elementwise_add") - sum_op_desc.set_input("X", ["out1"]) - sum_op_desc.set_input("Y", ["b1"]) - sum_op_desc.set_output("Out", ["out2"]) - - target = block.var("out2") - - expect_ops = [ - "mul", "elementwise_add", "fill_constant", "elementwise_add_grad", - "mul_grad" - ] - - def grad_name(name): - return name + "@GRAD" - - actual_ops = [] - param_to_grad = prog.append_backward(target, set()) - for var_name in ("x1", "y1", "out1", "b1"): - self.assertEqual(param_to_grad[var_name][0], grad_name(var_name)) - self.assertEqual(param_to_grad[var_name][1], 0) - - for op in block.all_ops(): - actual_ops.append(op.type()) - self.assertEqual(actual_ops, expect_ops) - def test_append_backward(self): prog = Program.instance() block = prog.global_block() - mul_x = block.create_parameter( + mul_x = block.create_var( dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") @@ -88,7 +50,35 @@ class TestProgram(unittest.TestCase): "Y": mul_y}, outputs={"Out": [mul_out]}, attrs={"x_num_col_dims": 1}) - param_to_grad = prog.append_backward(mul_out, set()) + + add_y = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="add.y") + add_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="add.out") + add_op = block.append_op( + type="elementwise_add", + inputs={"X": mul_out, + "Y": add_y}, + outputs={"Out": add_out}, + attrs={"x_num_col_dims": 1}) + + param_to_grad = prog.append_backward(add_out, set()) + + def grad_name(name): + return name + "@GRAD" + + for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out"): + self.assertEqual(param_to_grad[var_name][0], grad_name(var_name)) + self.assertEqual(param_to_grad[var_name][1], 0) + + expect_ops = [ + "mul", "elementwise_add", "fill_constant", "elementwise_add_grad", + "mul_grad" + ] + actual_ops = [] + for op in block.ops: + actual_ops.append(op.type) + self.assertEqual(actual_ops, expect_ops) if __name__ == '__main__':