diff --git a/CMakeLists.txt b/CMakeLists.txt index e5b2f32fba7cf6b2f1eb9356833b3ff3a0be4c6d..ed704585d8a6bf3befd9a549aa5a62a33fea3da9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) diff --git a/README.md b/README.md index 8ee67f66423df8bce27f70015be8752457cd9784..56d6c10c642787836abb55cb2974bda0b8d22da4 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0) +### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.0.1.post87 +pip install paddlepaddle-gpu==1.1.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.0.1.post85 +pip install paddlepaddle-gpu==1.1.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index eb31b522f57b16c9dcc2cf2f83d6bce89b6d0967..5557cdb8c7f47e5019cc259f8e4711309f2e3dab 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', ' paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) -paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) @@ -174,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 17188ac5f301102ae79c6ace676b84ee66e28801..aa6b7db5562f8596b1b30a16c0f08fcc433cfcd7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -56,6 +56,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 7c5f5bd80a937bf1a1c891155764833d7b21c5c2..b8690156763e4037811245b8016982710445e6a2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, nccl_ctxs_(ctxs) { if (nccl_ctxs_) { for (auto &p : places_) { - this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); } } } @@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_[p]; + auto *dev_ctx = dev_ctxes_.at(p); RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { auto &tensor_gpu = *var->GetMutable(); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 020d351e891c7afab37c59c0ff8d8e5e7ba184f2..72180fac864256ddda076c57e50ab1083c113d32 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase { nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { - dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); } } } diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index ab7412a19fbd13fa39dbae9af528d158cc9ddbd0..650de5a48de6b1fdab120cdeda563a169fd1a1c1 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -12,232 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/broadcast_op_handle.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" namespace paddle { namespace framework { namespace details { -namespace f = paddle::framework; -namespace p = paddle::platform; - -// test data amount -const f::DDim kDims = {20, 20}; - -struct TestBroadcastOpHandle { - std::vector> ctxs_; - std::vector local_scopes_; - std::vector param_scopes_; - Scope g_scope_; - std::unique_ptr op_handle_; - std::vector> vars_; - std::vector gpu_list_; - bool use_gpu_; -#ifdef PADDLE_WITH_CUDA - std::unique_ptr nccl_ctxs_; -#endif - - void WaitAll() { - for (size_t j = 0; j < ctxs_.size(); ++j) { - ctxs_[j]->Wait(); - } -#ifdef PADDLE_WITH_CUDA - if (nccl_ctxs_) { - nccl_ctxs_->WaitAll(); - } -#endif - } - - void InitCtxOnGpu(bool use_gpu) { - use_gpu_ = use_gpu; - if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA - int count = p::GetCUDADeviceCount(); - if (count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " - "device count is " - << count; - exit(0); - } - for (int i = 0; i < count; ++i) { - auto p = p::CUDAPlace(i); - gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); - } - nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); -#else - PADDLE_THROW("CUDA is not support."); -#endif - } else { - int count = 8; - for (int i = 0; i < count; ++i) { - auto p = p::CPUPlace(); - gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); - } -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_.reset(nullptr); -#endif - } - } - - void InitBroadcastOp(size_t input_scope_idx) { - for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scopes_.push_back(&(g_scope_.NewScope())); - Scope& local_scope = local_scopes_.back()->NewScope(); - *local_scopes_.back() - ->Var(details::kLocalExecScopeName) - ->GetMutable() = &local_scope; - local_scope.Var("out"); - param_scopes_.emplace_back(&local_scope); - } - param_scopes_[input_scope_idx]->Var("input"); - - std::unique_ptr n = - ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); - if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, - nccl_ctxs_.get())); -#else - PADDLE_THROW("CUDA is not support."); -#endif - } else { -#ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, - nccl_ctxs_.get())); -#else - op_handle_.reset( - new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_)); -#endif - } - - std::unique_ptr v = - ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); - auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", - gpu_list_[input_scope_idx]); - vars_.emplace_back(in_var_handle); - op_handle_->AddInput(in_var_handle); - - // add dummy var - - std::unique_ptr v2 = - ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v2.get())); - DummyVarHandle* dummy_var_handle = - static_cast(vars_.back().get()); - dummy_var_handle->ClearGeneratedOp(); - op_handle_->AddInput(dummy_var_handle); - - for (size_t j = 0; j < gpu_list_.size(); ++j) { - if (!use_gpu_) { - op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - } - std::unique_ptr v3 = - ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); - VarHandle* out_var_handle = - new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]); - vars_.emplace_back(out_var_handle); - op_handle_->AddOutput(out_var_handle); - } - - // add dummy var - std::unique_ptr v4 = - ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v4.get())); - DummyVarHandle* out_dummy_var_handle = - static_cast(vars_.back().get()); - out_dummy_var_handle->ClearGeneratedOp(); - op_handle_->AddOutput(out_dummy_var_handle); - } - - void TestBroadcastLodTensor(size_t input_scope_idx) { - auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); - auto in_lod_tensor = in_var->GetMutable(); - in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - - std::vector send_vector(static_cast(f::product(kDims))); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - f::LoD lod{{0, 10, 20}}; - paddle::framework::TensorFromVector( - send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); - in_lod_tensor->set_lod(lod); - in_lod_tensor->Resize(kDims); - - op_handle_->Run(false); - - WaitAll(); - - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = param_scopes_[j]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); - auto out_tensor = out_var->Get(); - PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); - - f::Tensor result_tensor; - f::TensorCopySync(out_tensor, cpu_place, &result_tensor); - float* ct = result_tensor.mutable_data(cpu_place); - - for (int64_t i = 0; i < f::product(kDims); ++i) { - ASSERT_NEAR(ct[i], send_vector[i], 1e-5); - } - } - } - - void TestBroadcastSelectedRows(size_t input_scope_idx) { - auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); - auto in_selected_rows = in_var->GetMutable(); - auto value = in_selected_rows->mutable_value(); - value->mutable_data(kDims, gpu_list_[input_scope_idx]); - int height = static_cast(kDims[0]) * 2; - std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, - 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; - in_selected_rows->set_height(height); - in_selected_rows->set_rows(rows); - - std::vector send_vector(static_cast(f::product(kDims))); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - paddle::framework::TensorFromVector( - send_vector, *(ctxs_[input_scope_idx]), value); - - op_handle_->Run(false); - - WaitAll(); - - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = param_scopes_[j]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); - auto& out_select_rows = out_var->Get(); - auto rt = out_select_rows.value(); - - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, - "height is not equal."); - for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); - } - - f::Tensor result_tensor; - f::TensorCopySync(rt, cpu_place, &result_tensor); - float* ct = result_tensor.data(); - - for (int64_t i = 0; i < f::product(kDims); ++i) { - ASSERT_NEAR(ct[i], send_vector[i], 1e-5); - } - } - } -}; - TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h new file mode 100644 index 0000000000000000000000000000000000000000..1a2a9ac328c4a9b89bfb89106af81b9fb3ed3028 --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestBroadcastOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + std::vector param_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector place_list_; + bool use_gpu_; +#ifdef PADDLE_WITH_CUDA + std::unique_ptr nccl_ctxs_; +#endif + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif + } + + void InitCtxOnGpu(bool use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } + nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_.reset(nullptr); +#endif + } + } + + void InitBroadcastOp(size_t input_scope_idx) { + for (size_t j = 0; j < place_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + local_scope.Var("out"); + param_scopes_.emplace_back(&local_scope); + } + param_scopes_[input_scope_idx]->Var("input"); + + std::unique_ptr n = + ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, + place_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, + place_list_, nccl_ctxs_.get())); +#else + op_handle_.reset( + new BroadcastOpHandle(n.get(), local_scopes_, place_list_)); +#endif + } + + std::unique_ptr v = + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); + auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", + place_list_[input_scope_idx]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add dummy var + + std::unique_ptr v2 = + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); + vars_.emplace_back(new DummyVarHandle(v2.get())); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back().get()); + dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddInput(dummy_var_handle); + + for (size_t j = 0; j < place_list_.size(); ++j) { + if (!use_gpu_) { + op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get()); + } + std::unique_ptr v3 = + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); + VarHandle* out_var_handle = + new VarHandle(v3.get(), 2, j, "out", place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + + // add dummy var + std::unique_ptr v4 = + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); + vars_.emplace_back(new DummyVarHandle(v4.get())); + DummyVarHandle* out_dummy_var_handle = + static_cast(vars_.back().get()); + out_dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddOutput(out_dummy_var_handle); + } + + std::vector InitLoDTensor(const std::string& varname, + size_t input_scope_idx, const f::LoD& lod, + float val_scalar = 0.0) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + + PADDLE_ENFORCE_NOT_NULL(var); + auto lod_tensor = var->GetMutable(); + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + val_scalar; + } + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), lod_tensor); + lod_tensor->set_lod(lod); + lod_tensor->Resize(kDims); + return send_vector; + } + + std::vector InitSelectedRows(const std::string& varname, + size_t input_scope_idx, + const std::vector& rows, + int height, float value_scalar = 0.0) { + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + value_scalar; + } + + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto selected_rows = var->GetMutable(); + auto value = selected_rows->mutable_value(); + value->mutable_data(kDims, place_list_[input_scope_idx]); + selected_rows->set_height(height); + selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + + return send_vector; + } + + void SelectedRowsEqual(const std::string& varname, int input_scope_idx, + const std::vector& send_vector, + const std::vector& rows, int height) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto& selected_rows = var->Get(); + auto rt = selected_rows.value(); + PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); + + for (size_t k = 0; k < selected_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); + } + + p::CPUPlace cpu_place; + f::Tensor result_tensor; + f::TensorCopySync(rt, cpu_place, &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); + } + } + + void LoDTensorEqual(const std::string& varname, + const std::vector& send_vec, const f::LoD& lod, + framework::Scope* scope) { + p::CPUPlace cpu_place; + auto var = scope->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get(); + PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); + f::Tensor result_tensor; + f::TensorCopySync(tensor, cpu_place, &result_tensor); + float* ct = result_tensor.mutable_data(cpu_place); + for (int64_t k = 0; k < f::product(kDims); ++k) { + ASSERT_NEAR(ct[k], send_vec[k], 1e-5); + } + } + + void TestBroadcastLodTensor(size_t input_scope_idx) { + f::LoD lod{{0, 10, 20}}; + auto send_vector = InitLoDTensor("input", input_scope_idx, lod); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual("out", send_vector, lod, param_scopes_[j]); + } + } + + void TestBroadcastSelectedRows(size_t input_scope_idx) { + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height); + } + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index b6282debdb4eb6b1f29c39e54ac4f3e2296838da..f9bbfe0016ce0ea0d15a83cb532c44518549b8ad 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool need_wait = in_var && in_var->GeneratedOp() && - in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_); return need_wait; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 525d24322442ef4dd6e8c24212af61c908959b87..0b772f9b63e2cfb78175f5e0d7011db8e6a5ec20 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle( : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { if (ctxs) { for (auto &p : places_) { - this->dev_ctxes_[p] = ctxs->DevCtx(p); + this->SetDeviceContext(p, ctxs->DevCtx(p)); } } } @@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() { PADDLE_ENFORCE_GT(places_.size(), 1, "Data balance can only be enabled when the number of " "places to run larger than 1."); - auto in_var_handles = DynamicCast(inputs_); - auto out_var_handles = DynamicCast(outputs_); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 6e22fedf1c39428528c00cce4c9a4460dfb95cb3..98fc390e72fab3701538fd6f974460fa5114fdb0 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( size_t num_complete = 0; remaining_ = 0; - BlockingQueue complete_q; + auto complete_q = std::make_shared>(); for (auto op : bootstrap_ops_) { - RunOpAsync(op_deps.get(), op, &complete_q); + RunOpAsync(op_deps.get(), op, complete_q); } while (num_complete != op_deps->size()) { - size_t num_comp = complete_q.Pop(); + size_t num_comp = complete_q->Pop(); if (num_comp == -1UL) { int remaining = 0; while (true) { @@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( break; } for (int i = 0; i < remaining; ++i) { - complete_q.Pop(); + complete_q->Pop(); } } exception_.ReThrow(); @@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( } void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, - OpHandleBase *op, BlockingQueue *complete_q) { + OpHandleBase *op, + const std::shared_ptr> &complete_q) { ++remaining_; this->pool_.enqueue([=] { OpHandleBase *op_to_run = op; @@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( if (op_to_run == nullptr) { op_to_run = pending_op; } else { - this->RunOpAsync(op_deps, pending_op, complete_q); + RunOpAsync(op_deps, pending_op, complete_q); } } } @@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { atomic_op_deps_ = pool_.enqueue([&] { - std::unordered_map> *op_deps = - new std::unordered_map>; + auto *op_deps = new std::unordered_map>; for (auto &pair : op_deps_) { (*op_deps)[pair.first] = pair.second; } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index dad3a231cba6402f57ba654a9ac5fb520b9c8f04..8b8382447105c8caa36963214684d6ee9fa15200 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::atomic remaining_; void RunOpAsync(std::unordered_map> *op_deps, - OpHandleBase *op, BlockingQueue *complete_q); + OpHandleBase *op, + const std::shared_ptr> &complete_q); void PrepareAtomicOpDeps(); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f12bd2b4e857648342aeb5ad33b6c0fe01c9c73 --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" + +namespace paddle { +namespace framework { +namespace details { + +struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { + std::vector out_varnames_; + + void InitFusedBroadcastOp(std::vector input_scope_idxes) { + // initialize scope and var + for (size_t i = 0; i < place_list_.size(); ++i) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + for (size_t j = 0; j < input_scope_idxes.size(); ++j) { + local_scope.Var("out_var" + j); + if (i == j) local_scope.Var("in_var" + j); + } + param_scopes_.emplace_back(&local_scope); + } + + // create op handle node + std::unique_ptr n = + ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation); + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new FusedBroadcastOpHandle( + n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not supported."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new FusedBroadcastOpHandle( + n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); +#else + op_handle_.reset( + new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_)); +#endif + } + + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + // add input var handle + std::unique_ptr in_node = + ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable); + VarHandle* in_var_handle = + new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i, + place_list_[input_scope_idxes[i]]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add output var handle + for (size_t j = 0; j < place_list_.size(); ++j) { + std::unique_ptr out_node = + ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable); + VarHandle* out_var_handle = + new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + } + } + + void TestFusedBroadcastLoDTensor(std::vector input_scope_idxes) { + std::vector> send_vec; + f::LoD lod{{0, 10, 20}}; + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + i); + float val_scalar = static_cast(i); + send_vec.push_back( + InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + i); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); + } + } + } + + void TestFusedBroadcastSelectedRows(std::vector input_scope_idxes) { + std::vector> send_vector; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + i); + float val_scalar = static_cast(i); + send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], + rows, height, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + i); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, + height); + } + } + } +}; + +TEST(FusedBroadcastTester, CPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, CPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} + +#ifdef PADDLE_WITH_CUDA +TEST(FusedBroadcastTester, GPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, GPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 9aae19fc73de4387186da47c55710c94d53f1b88..ca4633c5a8f22fc9f7319b06aa766f9fe37dc68c 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() { VarHandle *out_var_handle; { - auto out_var_handles = DynamicCast(outputs_); + auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, "The number of output should be one."); out_var_handle = out_var_handles.front(); @@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() { Tensor *out_tensor = out_value->mutable_value(); // copy - auto dev_ctx = dev_ctxes_[out_var_handle->place_]; + auto dev_ctx = dev_ctxes_.at(out_var_handle->place_); RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx, t_out_p] { int s = 0, e = 0; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3812f0abf1b7069525c4420054c61c01c908acfe..4822627ac3b65972f41d9a23d9fe3dba3de3f97d 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { for (auto *in : inputs_) { if (NeedWait(in)) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); } } } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7fc06f234d42a992328c0b6164f17945d8075c28..4503123eac810917cabcf1e62cff98552ed2f742 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,7 +27,7 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); if (places_.size() == 1) return; // the input and output may have dummy var. diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index a6289b055f97b7b0e57928358d84117b33cf2df8..999828ae457ba43541da06088ce7c25331fd05ec 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase { nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { - dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); } } } diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index f44b374edb29228dff5a8bf003d945291f166d49..65df7f2d510bf4e3e930398182c6dd1eae89241f 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() { continue; } if (in->GeneratedOp()) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p)); } } auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ba243979b34aa1f683de707525403becaf0a1c00..ef1626599795a553e654fe5d3ed74ef3a3a67d78 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { - dev_ctxes_[place_] = dev_ctx; + this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} @@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() { } else { #ifdef PADDLE_WITH_CUDA this->RunAndRecordEvent([&] { - auto stream = - static_cast(this->dev_ctxes_[place_]) - ->stream(); + auto stream = static_cast( + this->dev_ctxes_.at(place_)) + ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); VLOG(10) << place_ << "RUN Scale loss grad op"; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 31beef3ae829d72570ee7c879dac71ed600cd216..dc63effd1b7c8fe5bb3fc91058eb855e552d3926 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); std::unordered_map pending_ops; std::unordered_set pending_vars; - BlockingQueue ready_vars; + auto ready_vars = std::make_shared>(); std::unordered_set ready_ops; // For ops (e.g. nccl_all_reduce) that need to coordinate multiple // streams from multiple GPUs, it's faster to buffer them and schedule @@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get()); } } } for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, &ready_vars, var.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), var.get()); } for (auto &op : graph_->Get(details::kGraphOps)) { @@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList fetch_data(fetch_tensors.size()); InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, &ready_vars, &fetch_data); + &pending_vars, ready_vars.get(), &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { running_ops_++; - RunOp(&ready_vars, op); + RunOp(ready_vars, op); } set.clear(); }; @@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( run_op_futures_.clear(); exception_holder_.Clear(); event.reset(nullptr); - // Step 3. Execution while (!pending_vars.empty()) { // 1. Run All Ready ops @@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // 2. Find ready variable bool timeout; - auto cur_ready_vars = ready_vars.PopAll(1, &timeout); + auto cur_ready_vars = ready_vars->PopAll(1, &timeout); if (timeout) { if (exception_holder_.IsCaught()) { @@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } } PADDLE_ENFORCE(ready_ops.empty()); - // Wait FetchOps. ClearFetchOp(graph_.get(), &fetch_ops); @@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( } void ThreadedSSAGraphExecutor::RunOp( - BlockingQueue *ready_var_q, details::OpHandleBase *op) { + const std::shared_ptr> &ready_var_q, + details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { if (VLOG_IS_ON(10)) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 512f8a4ca5a9b82a395dde11722b8db44ea5ec27..dbb0b498d995a897b109bd4ef98521b2193276ed 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() {} private: - void RunOp(BlockingQueue *ready_var_q, + void RunOp(const std::shared_ptr> &ready_var_q, details::OpHandleBase *op); private: diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 29b604afbfcfc2bac67e447db8cd4c671c036dbe..b20d70132256bd5df7411c46ff4eb246b1f14ba8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() { return result; } +bool GraphItemCMP(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } +} + // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 void GraphPatternDetector::UniquePatterns( @@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns( std::vector result; std::unordered_set set; + std::hash hasher; for (auto &g : *subgraphs) { - size_t key = 0; - for (auto &item : g) { - key ^= std::hash{}(item.first); - key ^= std::hash{}(item.second); + // Sort the items in the sub-graph, and transform to a string key. + std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::stringstream ss; + for (auto &item : sorted_keys) { + ss << item.first << ":" << item.second; } + auto key = hasher(ss.str()); if (!set.count(key)) { result.emplace_back(g); set.insert(key); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 1e7da9a69c7cbf8c13306656599a759515802b76..669d08c70c9b7453264806b346a6c9eb211cfd4a 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); for (size_t j = 0; j < lod.size(); ++j) { auto &sub_lod = new_lod[j]; - auto &offset = sub_lod.back(); + size_t offset = sub_lod.back(); for (size_t k = 1; k < lod[j].size(); ++k) { sub_lod.push_back(lod[j][k] + offset); } diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h index 0ad6a709008406257d6c0a220bce38bb24e188cd..36a5c3c5d601390beedaf37ceb98ee2c63ecf5a6 100644 --- a/paddle/fluid/framework/lod_tensor_array.h +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -19,81 +19,7 @@ limitations under the License. */ namespace paddle { namespace framework { -// NOTE The vector can't be replaced with the class LoDTensorArray -// directly, because there are many vector used accross the project, -// and some of them are treated as LoDTensorArray. -#if !defined(PADDLE_ON_INFERENCE) - using LoDTensorArray = std::vector; -#else // !PADDLE_ON_INFERENCE - -#pragma message "LoDTensorArray is replaced with the inference one." -/* - * A LoDTensorArray which will not deallocate buffer when resized, fix the data - * diff in inference, and more performance friendly in the concurrency - * scenerios. - */ -class LoDTensorArray { - public: - LoDTensorArray() = default; - - using iterator = std::vector::iterator; - using const_iterator = std::vector::const_iterator; - - const_iterator begin() const { return array_.begin(); } - const_iterator end() const { return array_.begin() + size_; } - iterator begin() { return array_.begin(); } - iterator end() { return array_.begin() + size_; } - - void push_back(const LoDTensor& x) { - if (size_ < array_.size()) { - array_[size_++] = x; - } else { - array_.push_back(x); - ++size_; - } - } - void resize(size_t size) { - if (array_.size() < size) { - array_.resize(size); - } - size_ = size; - } - - void emplace_back() { array_.emplace_back(); } - - void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); } - - LoDTensor& back() { return array_.back(); } - - size_t space() const { return array_.size(); } - - void reserve(size_t size) { - // Naive warning to tell user this array might be to large. The memory and - // buffer used by this TensorArray will not be deleted during the training - // and inference phase, so attention not to make it expand too long. - if (size > 800UL) { - LOG(WARNING) << "TensorArray has more than 800 items"; - } - array_.reserve(size); - } - - bool empty() const { return size_ == 0UL; } - void clear() { size_ = 0UL; } - - LoDTensor& operator[](size_t id) { return array_[id]; } - const LoDTensor& operator[](size_t id) const { return array_[id]; } - LoDTensor& at(size_t id) { return array_.at(id); } - const LoDTensor& at(size_t id) const { return array_.at(id); } - - size_t size() const { return size_; } - - private: - size_t size_{0}; - std::vector array_; -}; -#endif // !PADDLE_ON_INFERENCE - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9259bb740a8a2408e4dc7be21711560fdf250752..45fc36c70633204dbfadbd10757c08b009d2cc74 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() { } } -static bool VarIsTensor(const Variable* var) { - return var->IsType() || var->IsType(); +static bool VarIsTensor(const Variable& var) { + return var.IsType() || var.IsType(); } -const Tensor* GetTensorFromVar(Variable* var) { - if (var->IsType()) { - return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); +const Tensor* GetTensorFromVar(const Variable& var) { + if (var.IsType()) { + return static_cast(&(var.Get())); + } else if (var.IsType()) { + return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + var.Type().name()); } } @@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const { template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); - return var == nullptr ? nullptr - : GetTensorFromVar(const_cast(var)); + return var == nullptr ? nullptr : GetTensorFromVar(*var); } template <> @@ -428,7 +427,7 @@ const std::vector ExecutionContext::MultiInput( std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : GetTensorFromVar(var); + return var == nullptr ? nullptr : GetTensorFromVar(*var); }); return res; } @@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack( for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = - GetTensorFromVar(transfer_scope.FindVar(var_name)); + auto* var = transfer_scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr", + var_name); + auto* transformed_tensor = GetTensorFromVar(*var); original_tensor->ShareDataWith(*transformed_tensor); } } @@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData( for (auto& var_name : var_name_item.second) { auto* var = scope.FindVar(var_name); // Only tensor can be tranfer to another device. - if (var == nullptr || !VarIsTensor(var)) { + if (var == nullptr || !VarIsTensor(*var)) { continue; } - auto* tensor_in = GetTensorFromVar(var); + auto* tensor_in = GetTensorFromVar(*var); if (!tensor_in->IsInitialized()) { continue; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index a04d2834eb94c2d8df9c6e48782d10bb3254a6dd..96ad3205235b921a7cf60ed674a8350f74d18509 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -63,7 +63,7 @@ inline std::string GradVarName(const std::string& var_name) { } proto::VarType::Type GetDataTypeOfVar(const Variable* var); -const Tensor* GetTensorFromVar(Variable* var); +const Tensor* GetTensorFromVar(const Variable& var); class OperatorBase; class ExecutionContext; diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index cb2061c06a429d8e8116001a4aa4e8c46ea13428..a0a9a573603ceb6b577529101cb331adbc81337a 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -75,6 +75,19 @@ TEST(Tensor, MutableData) { platform::CPUPlace()); EXPECT_EQ(p1, p2); } + // Not sure if it's desired, but currently, Tensor type can be changed. + { + framework::Tensor src_tensor; + int8_t* p1 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + *p1 = 1; + + uint8_t* p2 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_EQ(static_cast(p2[0]), 1); + } #ifdef PADDLE_WITH_CUDA { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index dbbe8bcba69a1d87e21c8eae18834fb708e8b1e4..d31c8e3b7d66a0cdb2c4725783c9a24f049c666d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,3 +1,6 @@ +if(WITH_TESTING) + include(test.cmake) # some generic cmake funtion for inference +endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d4d2fd4634f9e11f3f002e11e177c332ced49885..0354f9e6e9588af601210b8a71ae98c1f90d62f0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) -function (inference_analysis_test TARGET) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(mem_opt "") - if(WITH_GPU) - set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") - endif() - cc_test(${TARGET} - SRCS "${analysis_test_SRCS}" - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) - set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) - endif(WITH_TESTING) +function(inference_analysis_test TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test(${TARGET} + SRCS ${analysis_test_SRCS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS}) + endif() endfunction(inference_analysis_test) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index a55426f74f988176aeb180e48d1af8632ed3b5c7..49a9ebe3ddec1e4fd59ae1155a706859e249d25c 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -17,39 +17,12 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) - -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB} - ) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() -function(inference_api_test TARGET_NAME) - if (WITH_TESTING) - set(options "") - set(oneValueArgs SRC) - set(multiValueArgs ARGS) - cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (WITH_GPU) - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15) - else() - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - endif() - if(inference_test_ARGS) - set_tests_properties(${TARGET_NAME} - PROPERTIES DEPENDS "${inference_test_ARGS}") - endif() - endif(WITH_TESTING) -endfunction(inference_api_test) - cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) @@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) -inference_api_test(test_api_impl SRC api_impl_tester.cc - ARGS test_word2vec test_image_classification) +if(WITH_TESTING) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) +endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api ARGS --dirname=${PYTHON_TESTS_DIR}/book) @@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) - -inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) + if(WITH_TESTING) + inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) + endif() endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 1d4dfb8649563ab23ffeec1f79bb305fd2ebae26..5152b8670ddb206f0927c03149684af4a096df42 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -22,12 +22,14 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" #ifdef __clang__ -#define ACC_DIFF 4e-2 +#define ACC_DIFF 4e-3 #else -#define ACC_DIFF 1e-2 +#define ACC_DIFF 1e-3 #endif -DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(word2vec_dirname, "", + "Directory of the word2vec inference model."); +DEFINE_string(book_dirname, "", "Directory of the book inference model."); namespace paddle { @@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.model_dir = FLAGS_word2vec_dirname; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 702158ea3bcab854eece3ccd40724d92efcbae67..89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { //# 1. Create PaddlePredictor with a config. NativeConfig config0; - config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.model_dir = FLAGS_dirname; config0.use_gpu = true; config0.fraction_of_gpu_memory = 0.3; config0.device = 0; MixedRTConfig config1; - config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.model_dir = FLAGS_dirname; config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; config1.device = 0; diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 1ac655bdbbf7c45bfdde2c5fa8026fab2c891903..ff718077c1ba6b10fe87aac10d84f96a23ad6bba 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB make -j - word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' + word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then for use_gpu in $use_gpu_list; do ./simple_on_word2vec \ diff --git a/paddle/fluid/inference/test.cmake b/paddle/fluid/inference/test.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7 --- /dev/null +++ b/paddle/fluid/inference/test.cmake @@ -0,0 +1,31 @@ +set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING + "A path setting inference demo download directories.") +function (inference_download install_dir url filename) + message(STATUS "Download inference test stuff from ${url}/${filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + message(STATUS "finish downloading ${filename}") +endfunction() + +function (inference_download_and_uncompress install_dir url filename) + inference_download(${install_dir} ${url} ${filename}) + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") +endfunction() + +set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") +if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +endif() +set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") + +function (inference_base_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() + cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS}) +endfunction() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index c3dd1f433691e1c96e9f38ef7b595befad26408f..71fdc67068b3d92a774db82f569d212f6cffad78 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,18 +1,4 @@ -set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com") -set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) -function (inference_download install_dir url filename) - message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") - message(STATUS "finish downloading ${filename}") -endfunction() - -function (inference_download_and_uncompress install_dir url filename) - inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") -endfunction() function(download_model_and_data install_dir model_name data_name) if (NOT EXISTS ${install_dir}) diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed71594ba5781590f3291d56c4ba1a4443003bd5 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedSpatialTransformerDescriptor = + platform::ScopedSpatialTransformerDescriptor; + +template +class CUDNNAffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto* theta = ctx.Input("Theta"); + auto* output = ctx.Output("Output"); + const T* theta_data = theta->data(); + + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + T* output_data = output->mutable_data( + {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace()); + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward( + handle, cudnn_st_desc, theta_data, output_data)); + } +}; + +template +class CUDNNAffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + const T* output_grad_data = output_grad->data(); + T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward( + handle, cudnn_st_desc, output_grad_data, theta_grad_data)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridOpKernel, + paddle::operators::CUDNNAffineGridOpKernel); +REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridGradOpKernel, + paddle::operators::CUDNNAffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ea28265a245c9cd1a35a79324a33f7cf208a159 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -0,0 +1,233 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/affine_grid_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx) { + Tensor numbers; + T* number_data = numbers.mutable_data({count}, platform::CPUPlace()); + T slice = (end - start) / (T)(count - 1); + for (int i = 0; i < count; ++i) { + number_data[i] = start + (T)i * slice; + } + return numbers; + } +}; + +class AffineGridOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Theta"), + "Input(Theta) of AffineGridOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of AffineGridOp should not be null."); + auto theta_dims = ctx->GetInputDim("Theta"); + PADDLE_ENFORCE(theta_dims.size() == 3, + "AffineGrid's Input(Theta) should be 3-D tensor."); + + auto output_shape = ctx->Attrs().Get>("output_shape"); + if (output_shape.size() == 0) { + PADDLE_ENFORCE(ctx->HasInput("OutputShape"), + "Input(OutputShape) of AffineGridOp should not be null if " + "attr(output_shape) is not configured."); + auto output_shape_dims = ctx->GetInputDim("OutputShape"); + PADDLE_ENFORCE(output_shape_dims.size() == 1, + "AffineGrid's Input(OutputShape) should be 1-D tensor."); + } else { + PADDLE_ENFORCE(output_shape.size() == 4, + "The size of attr(output_shape) should be 4."); + } + + PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2."); + PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3."); + // N * H * W * 2 + ctx->SetOutputDim("Output", + framework::make_ddim({theta_dims[0], -1, -1, 2})); + ctx->ShareLoD("Theta", "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif + auto data_type = framework::ToDataType(ctx.Input("Theta")->type()); + return framework::OpKernelType(data_type, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library); + } +}; + +class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Theta", + "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. " + "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, " + "y_1)."); + AddInput("OutputShape", + "(Tensor) The shape of target image with format [N, C, H, W].") + .AsDispensable(); + AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2]."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(true); + AddAttr>( + "output_shape", + "The target output image shape with format [N, C, H, W].") + .SetDefault(std::vector()); + + AddComment(R"DOC( + It generates a grid of (x,y) coordinates using the parameters of the + affine transformation that correspond to a set of points where the input + feature map should be sampled to produce the transformed output feature map. + + Given: + Theta = [[[x_11, x_12, x_13] + [x_14, x_15, x_16]] + [[x_21, x_22, x_23] + [x_24, x_25, x_26]]] + + OutputShape = [2, 3, 5, 5] + + Step 1: + + Generate relative coordinates according to OutputShape. + The values of relative coordinates are in the interval between -1 and 1. + The shape of the relative coordinates is [2, H, W] as below: + + C = [[[-1. -1. -1. -1. -1. ] + [-0.5 -0.5 -0.5 -0.5 -0.5] + [ 0. 0. 0. 0. 0. ] + [ 0.5 0.5 0.5 0.5 0.5] + [ 1. 1. 1. 1. 1. ]] + [[-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ]]] + C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + + Step2: + Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + C_ = [[-1. -1. 1. ] + [-0.5 -1. 1. ] + [ 0. -1. 1. ] + [ 0.5 -1. 1. ] + [ 1. -1. 1. ] + [-1. -0.5 1. ] + [-0.5 -0.5 1. ] + [ 0. -0.5 1. ] + [ 0.5 -0.5 1. ] + [ 1. -0.5 1. ] + [-1. 0. 1. ] + [-0.5 0. 1. ] + [ 0. 0. 1. ] + [ 0.5 0. 1. ] + [ 1. 0. 1. ] + [-1. 0.5 1. ] + [-0.5 0.5 1. ] + [ 0. 0.5 1. ] + [ 0.5 0.5 1. ] + [ 1. 0.5 1. ] + [-1. 1. 1. ] + [-0.5 1. 1. ] + [ 0. 1. 1. ] + [ 0.5 1. 1. ] + [ 1. 1. 1. ]] + Step3: + Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ + )DOC"); + } +}; + +class AffineGridOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + auto theta_dims = ctx->GetInputDim("Theta"); + if (ctx->HasOutput(framework::GradVarName("Theta"))) { + ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Theta")->type()), + ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_); + } +}; + +class AffineGridGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_grid_grad"); + op->SetInput("Theta", Input("Theta")); + op->SetInput("OutputShape", Input("OutputShape")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker, + ops::AffineGridGradMaker); +REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); + +REGISTER_OP_CPU_KERNEL( + affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL( + affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..07e26c292c3bafc4d98bd392a9e1e21a9eb383a8 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +/** + *Return a tensor with evenly spaced numbers over a specified interval. + */ +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx); +}; + +template +class AffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto* theta = ctx.Input("Theta"); + int n = theta->dims()[0]; + + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + auto* output = ctx.Output("Output"); + output->mutable_data({n, h, w, 2}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), output, + static_cast(0)); + + Linspace linspace; + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); + Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); + blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, + T(0)); + } + } +}; + +template +class AffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), theta_grad, + static_cast(0)); + + Linspace linspace; + + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); + Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); + blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), + &sliced_theta_grad, T(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc index d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638..89416f7ab5d07ddac5b540b9bb361f831c1ef360 100644 --- a/paddle/fluid/operators/delete_var_op.cc +++ b/paddle/fluid/operators/delete_var_op.cc @@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase { } }; +class DeleteVarOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -48,4 +53,5 @@ It should not be configured by users directly. REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, paddle::framework::EmptyGradOpMaker, - paddle::operators::DeleteVarOpInfoMaker); + paddle::operators::DeleteVarOpInfoMaker, + paddle::operators::DeleteVarOpShapeInference); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 089b541a0a61adb5efda6b2e027c913d5808dff0..f84ff206fffddef1030b7ed439e887bdfef342a6 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel); + ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 7e014dd1cb47ee0575308dc13ba7bc7617baebff..9f4aef08cd58e72ce344a640e6564b9e360ce169 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 7be8539a7b0f1890898fd386a3056601fda8a7c3..6d491dbf1ed162ef07fda4c07e95cc57108486fd 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -31,7 +31,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, @@ -70,7 +70,41 @@ class MaxSeqPoolFunctor { } } }; +// Instantisation of Max Sequence Pooling for test phase eg. no need to fill +// index buffer +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), 1); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim], + dim * sizeof(T)); + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + } + } + } + } + } +}; template class MaxSeqPoolGradFunctor { public: @@ -188,11 +222,16 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const platform::CPUDeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, - framework::Tensor* output, + framework::Tensor* output, bool is_test, framework::Tensor* index = nullptr) { if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, output, index); + if (is_test) { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, output, index); + } else { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, output, index); + } return; } if (pooltype == "LAST") { @@ -200,6 +239,7 @@ class SequencePoolFunctor { last_pool(context, input, output); return; } + if (pooltype == "FIRST") { math::FirstSeqPoolFunctor first_pool; first_pool(context, input, output); diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index a92aef805a0434f2ebcbc62d4e5eaef0cfb21bfa..0015fafbc892912424dfa6dbd1778438d384ca19 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -133,7 +133,7 @@ class SequencePoolFunctor { public: void operator()(const platform::CUDADeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, - framework::Tensor* output, + framework::Tensor* output, bool is_test, framework::Tensor* index = nullptr) { auto& lod = input.lod()[0]; const size_t item_dim = output->numel() / output->dims()[0]; diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 8dcbee65d0b63a137e5f422ec8667cc950641b4a..a1046ea2160d0ae9c2251612c97d3f2640b0aad1 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -28,7 +28,7 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const DeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, framework::Tensor* output, - framework::Tensor* index = nullptr); + bool is_test = false, framework::Tensor* index = nullptr); }; template diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc index 15d3f064eb7b025dc9a85b2aabad24186061cbd4..217bb1610fd3f02f0f72d3b7750ebcdfad243f48 100644 --- a/paddle/fluid/operators/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) This tensor is used for the sequence max-pooling " "to record the max indexes.") .AsIntermediate(); + AddAttr("is_test", "").SetDefault(false); AddAttr( "pooltype", "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h index 2aa20792f24305a106c500a3d7a6e3d363bc31d8..f2e4a55dee49664b2fc09813f6dba5f68aaf11d5 100644 --- a/paddle/fluid/operators/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); std::string pooltype = context.Attr("pooltype"); - Tensor* index = nullptr; - if (pooltype == "MAX") { - index = context.Output("MaxIndex"); - } auto dims = in->dims(); auto lod = in->lod(); @@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel { dims[0] = lod[0].size() - 1; out->Resize({dims}); out->mutable_data(context.GetPlace()); - if (pooltype == "MAX") { + Tensor* index = nullptr; + + const bool is_test = context.Attr("is_test"); + + // Do not create index buffer for inference (is_test) mode + // TODO(jczaja): Skip index buffer creation for other devices eg. GPU + if (pooltype == "MAX" && + (is_test == false || + platform::is_cpu_place(context.GetPlace()) == false)) { + index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); } math::SequencePoolFunctor pool; pool(context.template device_context(), pooltype, *in, out, - index); + is_test, index); } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 6fe30630e9683f59044b216b9e9b1f7dd647b1e2..d19ac9839c90a116265b761e3b1b3f855e2d95e8 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); + auto x_vars_name = ctx.Inputs("X"); framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout{framework::DataLayout::kAnyLayout}; @@ -81,10 +82,11 @@ class SumOp : public framework::OperatorWithKernel { if (x_vars[0]->IsType()) { int dtype = -1; - for (auto& x_var : x_vars) { + for (size_t idx = 0; idx < x_vars.size(); ++idx) { + PADDLE_ENFORCE(x_vars[idx] != nullptr, + "Input var[%s] should not be nullptr", x_vars_name[idx]); // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor. - auto tensor = framework::GetTensorFromVar( - const_cast(x_var)); + auto tensor = framework::GetTensorFromVar(*x_vars[idx]); if (tensor->numel() == 0) { continue; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a29562b0692684a52a2f022023ea57c3ca1ef712..d7676f89ab5e781f910f98d03e72d5f7c1023a9a 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -147,7 +147,6 @@ function cmake_gen() { -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} - -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} @@ -180,7 +179,6 @@ EOF -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ - -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 604f3eacd75beff306915b224b30c369dd3a486f..22c60c1cbe4faa8577fa655766e42694652e498d 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): load_prog = Program() load_block = load_prog.global_block() + need_delete_vars = [] for var_tuple in slice_vars_and_attrs: orig_var = var_tuple[0] start = var_tuple[1] slice_var = var_tuple[2] - end = start + reduce(lambda x, y: x * y, slice_var.shape) + end = start + slice_var.shape[0] clone_orig_var = load_block.create_var( name=orig_var.name, @@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): attrs={'axes': [0], 'starts': [start], 'ends': [end]}) - + need_delete_vars.append(clone_orig_var) + load_block.append_op( + type='delete_var', + inputs={'X': need_delete_vars}, ) executor.run(load_prog) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2d27ccbb118cd5c73c161862a5c182dd49bb5e60..3f5b0bcd7ba8d34069e819b9053e82cdff8ebc2b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -154,6 +154,7 @@ __all__ = [ 'mul', 'sigmoid_cross_entropy_with_logits', 'maxout', + 'affine_grid', 'sequence_reverse', 'affine_channel', 'hash', @@ -711,8 +712,18 @@ def dynamic_gru(input, The first part are weights of the update gate and reset gate with shape :math:`(D \\times 2D)`, and the second part are weights for candidate hidden state with shape :math:`(D \\times D)`. - bias_attr(ParamAttr): The parameter attribute for learnable the - hidden-hidden bias. + + If it is set to None or one attribute of ParamAttr, dynamic_gru will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, dynamic_gru will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. is_reverse(bool): Whether to compute reversed GRU, default :attr:`False`. gate_activation(str): The activation for update gate and reset gate. @@ -750,7 +761,7 @@ def dynamic_gru(input, attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) batch_size = input.shape[0] inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - if h_0 != None: + if h_0: assert h_0.shape == ( batch_size, size ), 'The shape of h0 should be(batch_size, %d)' % size @@ -811,10 +822,29 @@ def gru_unit(input, Args: input (Variable): The fc transformed input value of current step. - hidden (Variable): The hidden value of lstm unit from previous step. + hidden (Variable): The hidden value of gru unit from previous step. size (integer): The input dimension value. - param_attr (ParamAttr): The weight parameters for gru unit. Default: None - bias_attr (ParamAttr): The bias parameters for gru unit. Default: None + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weight matrix. Note: + + - The shape of the weight matrix is :math:`(T \\times 3D)`, where + :math:`D` is the hidden size. + - All elements in the weight matrix can be divided into two parts. + The first part are weights of the update gate and reset gate with + shape :math:`(D \\times 2D)`, and the second part are weights for + candidate hidden state with shape :math:`(D \\times D)`. + + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, gru_unit will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. activation (string): The activation type for cell (actNode). Default: 'tanh' gate_activation (string): The activation type for gates (actGate). @@ -1826,7 +1856,7 @@ def conv3d(input, return helper.append_activation(pre_act) -def sequence_pool(input, pool_type): +def sequence_pool(input, pool_type, is_test=False): """ This function add the operator for sequence pooling. It pools features of all time-steps of each instance, and is applied @@ -1863,6 +1893,7 @@ def sequence_pool(input, pool_type): input(variable): The input variable which is a LoDTensor. pool_type (string): The pooling type of sequence_pool. It supports average, sum, sqrt and max. + is_test(bool, Default False): Used distinguish training from scoring mode. Returns: The sequence pooling variable which is a Tensor. @@ -1890,7 +1921,8 @@ def sequence_pool(input, pool_type): inputs={"X": input}, outputs={"Out": pool_out, "MaxIndex": max_index}, - attrs={"pooltype": pool_type.upper()}) + attrs={"pooltype": pool_type.upper(), + "is_test": is_test}) # when pool_type is max, variable max_index is initialized, # so we stop the gradient explicitly here @@ -3019,7 +3051,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): x = fluid.layers.data(name='y', shape=[10, 5], dtype='float32', lod_level=1) - pad_value = fluid.layers.assign(input=numpy.array([0])) + pad_value = fluid.layers.assign( + input=numpy.array([0], dtype=numpy.float32)) out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) """ @@ -4441,7 +4474,10 @@ def transpose(x, perm, name=None): Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32') + # use append_batch_size=False to avoid prepending extra + # batch size in shape + x = fluid.layers.data(name='x', shape=[5, 10, 15], + dtype='float32', append_batch_size=False) x_transposed = layers.transpose(x, perm=[1, 0, 2]) """ @@ -6106,6 +6142,124 @@ def crop(x, shape=None, offsets=None, name=None): return out +def affine_grid(theta, out_shape, name=None): + """ + It generates a grid of (x,y) coordinates using the parameters of + the affine transformation that correspond to a set of points where + the input feature map should be sampled to produce the transformed + output feature map. + + .. code-block:: text + + * Case 1: + + Given: + + theta = [[[x_11, x_12, x_13] + [x_14, x_15, x_16]] + [[x_21, x_22, x_23] + [x_24, x_25, x_26]]] + + out_shape = [2, 3, 5, 5] + + Step 1: + + Generate normalized coordinates according to out_shape. + The values of the normalized coordinates are in the interval between -1 and 1. + The shape of the normalized coordinates is [2, H, W] as below: + + C = [[[-1. -1. -1. -1. -1. ] + [-0.5 -0.5 -0.5 -0.5 -0.5] + [ 0. 0. 0. 0. 0. ] + [ 0.5 0.5 0.5 0.5 0.5] + [ 1. 1. 1. 1. 1. ]] + [[-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ]]] + C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + + Step2: + + Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + C_ = [[-1. -1. 1. ] + [-0.5 -1. 1. ] + [ 0. -1. 1. ] + [ 0.5 -1. 1. ] + [ 1. -1. 1. ] + [-1. -0.5 1. ] + [-0.5 -0.5 1. ] + [ 0. -0.5 1. ] + [ 0.5 -0.5 1. ] + [ 1. -0.5 1. ] + [-1. 0. 1. ] + [-0.5 0. 1. ] + [ 0. 0. 1. ] + [ 0.5 0. 1. ] + [ 1. 0. 1. ] + [-1. 0.5 1. ] + [-0.5 0.5 1. ] + [ 0. 0.5 1. ] + [ 0.5 0.5 1. ] + [ 1. 0.5 1. ] + [-1. 1. 1. ] + [-0.5 1. 1. ] + [ 0. 1. 1. ] + [ 0.5 1. 1. ] + [ 1. 1. 1. ]] + Step3: + Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ + + Args: + theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. + out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. + out_shape can be a Variable or a list or tuple. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The output with shape [N, H, W, 2]. + + Raises: + ValueError: If the type of arguments is not supported. + + Examples: + + .. code-block:: python + theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") + out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") + data = fluid.layers.affine_grid(theta, out_shape) + + # or + data = fluid.layers.affine_grid(theta, [5, 3, 28, 28]) + + """ + helper = LayerHelper('affine_grid') + + if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ + isinstance(out_shape, Variable)): + raise ValueError("The out_shape should be a list, tuple or Variable.") + + if not isinstance(theta, Variable): + raise ValueError("The theta should be a Variable.") + + out = helper.create_variable_for_type_inference(theta.dtype) + ipts = {'Theta': theta} + attrs = {} + if isinstance(out_shape, Variable): + ipts['OutputShape'] = out_shape + else: + attrs['output_shape'] = out_shape + + helper.append_op( + type='affine_grid', + inputs=ipts, + outputs={'Output': out}, + attrs=None if len(attrs) == 0 else attrs) + return out + + def rank_loss(label, left, right, name=None): """ **Rank loss layer for RankNet** diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt index 673c965b662a022739f8d489c331f4de9455a926..91c1d17eb5391ea37a41a886594cc71c6e6c56bd 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt @@ -1,7 +1,19 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() +if(NOT APPLE) + # default test + foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) + endforeach() +else() + foreach(src ${TEST_OPS}) + if(${src} STREQUAL "test_image_classification_vgg") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif(${src} STREQUAL "test_image_classification_resnet") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif() + py_test(${src} SRCS ${src}.py) + endif() + endforeach() +endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e53c49b13e0330e7e59644d366fefb1246142259..2e87d8f4b4fa07773f205fd0a2151095a2353fc6 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) + LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) + LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) + LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 @@ -89,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) -py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) +if(NOT APPLE) + py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) +endif() diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 01e9795d8b1beb67270f45fe7ba2819bf8c3be3e..1cda2711f765622b0bda6f4c688f69352bbd2a6f 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) + # TODO(typhoonzero): fix distributed adam optimizer + # opt = fluid.optimizer.AdamOptimizer( + # learning_rate=0.001, beta1=0.9, beta2=0.999) + opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) # Reader train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..edc60550058f53da456c21de4b41142b907743df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import signal +import subprocess +import argparse +import time +import math +import random +from multiprocessing import Process +from functools import reduce + +import numpy as np +import unittest +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import io + +from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP +from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5 + + +class TestDistSaveLoad2x2(TestDistSimnetBow2x2): + def _load_persistable_vars(self, executor, dirname, program): + def _is_checkpoint_var(var): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + : param var(Variable) + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False + + # .block is named for distribute train variables, checkpoint will not save it. + if ".block" in var.name: + return False + + if "tmp_" in var.name: + return False + + return var.persistable + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + def run_pserver(self, args): + self.get_model(batch_size=2) + # NOTE: pserver should not call memory optimize + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), args.endpoints, + args.trainers, args.sync_mode) + pserver_prog = t.get_pserver_program(args.current_endpoint) + startup_prog = t.get_startup_program(args.current_endpoint, + pserver_prog) + + need_load = bool(int(os.getenv("LOAD", "0"))) + model_dir = os.getenv("MODEL_DIR", "") + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + if need_load and model_dir: + self._load_persistable_vars(exe, model_dir, startup_prog) + exe.run(pserver_prog) + + def run_trainer(self, args): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=2) + + if args.mem_opt: + fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) + if args.is_dist: + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), + args.endpoints, args.trainers, + args.sync_mode) + + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + + build_stra = fluid.BuildStrategy() + + if args.use_reduce: + build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + exe = fluid.ParallelExecutor( + args.use_cuda, + loss_name=avg_cost.name, + exec_strategy=strategy, + build_strategy=build_stra) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.values() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = train_reader() + + def get_data(): + origin_batch = next(reader_generator) + if args.is_dist and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch + + need_save = bool(int(os.getenv("SAVE", "0"))) + model_dir = os.getenv("MODEL_DIR", "") + + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) + print(np.ravel(var).tolist()) + + +if __name__ == "__main__": + paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") + runtime_main(TestDistSaveLoad2x2) diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py new file mode 100644 index 0000000000000000000000000000000000000000..576d00940c4c7a5e30af5550e14b674a73e7df11 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +def AffineGrid(theta, size): + n = size[0] + w = size[3] + h = size[2] + h_idx = np.repeat( + np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis] + w_idx = np.repeat( + np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis] + grid = np.concatenate( + [w_idx, h_idx, np.ones([h, w, 1])], axis=2) # h * w * 3 + grid = np.repeat(grid[np.newaxis, :], size[0], axis=0) # n * h * w *3 + + ret = np.zeros([n, h * w, 2]) + theta = theta.transpose([0, 2, 1]) + for i in range(len(theta)): + ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i]) + +# print ret.reshape([h * w, 2]).astype("float32") + return ret.reshape([n, h, w, 2]).astype("float32") + + +class TestAffineGridOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = "affine_grid" + theta = np.random.randint(1, 3, self.theta_shape).astype("float32") + theta = np.ones(self.theta_shape).astype("float32") + self.inputs = {'Theta': theta} + self.attrs = {"use_cudnn": True} + if self.dynamic_shape: + self.inputs['OutputShape'] = self.output_shape + else: + self.attrs['output_shape'] = self.output_shape + self.outputs = {'Output': AffineGrid(theta, self.output_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad( + ['Theta'], + 'Output', + no_grad_set=['OutputShape'], + max_relative_error=0.006) + + def initTestCase(self): + self.theta_shape = (3, 2, 3) + self.output_shape = np.array([3, 2, 5, 7]).astype("int32") + self.dynamic_shape = False + + +class TestAffineGridOpCase1(TestAffineGridOp): + def initTestCase(self): + self.theta_shape = (3, 2, 3) + self.output_shape = np.array([3, 2, 5, 7]).astype("int32") + self.dynamic_shape = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 87fd03ca61d33a53b9323edb2ec7e1c71655816b..07814bc2571b380ec24c825615e3ef3d16e694be 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -22,6 +22,8 @@ import signal import subprocess import six import argparse +import pickle +import numpy as np import paddle.fluid as fluid @@ -128,10 +130,15 @@ class TestDistRunnerBase(object): else: return origin_batch + out_losses = [] for _ in six.moves.xrange(RUN_STEP): loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(get_data())) - print(loss) + out_losses.append(loss[0]) + if six.PY2: + print(pickle.dumps(out_losses)) + else: + sys.stdout.buffer.write(pickle.dumps(out_losses)) def runtime_main(test_class): @@ -149,7 +156,7 @@ def runtime_main(test_class): parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_reduce', action='store_true') parser.add_argument( - '--use_reader_alloc', action='store_true', required=False, default=True) + '--use_reader_alloc', action='store_true', required=False) parser.add_argument('--batch_size', required=False, type=int, default=2) parser.add_argument( '--batch_merge_repeat', required=False, type=int, default=1) @@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase): self._pservers = 2 self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._find_free_port(), self._find_free_port()) - self._python_interp = "python" + self._python_interp = sys.executable self._sync_mode = True self._enforce_place = None self._mem_opt = False @@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase): return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe - def _wait_ps_ready(self, pid): - retry_times = 50 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(3) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error as e: - sys.stderr.write('waiting for pserver: %s, left retry %d\n' % - (e, retry_times)) - retry_times -= 1 - def _run_local(self, model, envs, @@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase): env=envs) local_out, local_err = local_proc.communicate() - local_ret = cpt.to_text(local_out) if check_error_log: err_log.close() - sys.stderr.write('local_stdout: %s\n' % local_ret) + sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) sys.stderr.write('local_stderr: %s\n' % local_err) - local_losses = local_ret.split("\n") - return local_losses + return pickle.loads(local_out) def _run_cluster(self, model, envs, check_error_log): # Run dist train to compare with local results ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, check_error_log, envs) - self._wait_ps_ready(ps0.pid) - self._wait_ps_ready(ps1.pid) + ps0_ep, ps1_ep = self._ps_endpoints.split(",") tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" @@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase): env0.update(envs) env1.update(envs) - print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) - print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) + print("tr0_cmd:{}".format(tr0_cmd)) + print("tr1_cmd:{}".format(tr1_cmd)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase): env=env1) tr0_out, tr0_err = tr0_proc.communicate() - tr0_loss_text = cpt.to_text(tr0_out) tr1_out, tr1_err = tr1_proc.communicate() - tr1_loss_text = cpt.to_text(tr1_out) # close trainer file tr0_pipe.close() @@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase): ps1.terminate() # print log - sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) - sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) - sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) + sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) - tr0_losses = tr0_loss_text.split("\n") - tr1_losses = tr1_loss_text.split("\n") - - return tr0_losses, tr1_losses + # return tr0_losses, tr1_losses + return pickle.loads(tr0_out), pickle.loads(tr1_out) def check_with_place(self, model_file, @@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase): check_error_log) for step_id in range(RUN_STEP): - local_loss = eval(local_losses[step_id])[0] - tr0_loss = eval(tr0_losses[step_id])[0] - tr1_loss = eval(tr1_losses[step_id])[0] - dist_loss = (tr0_loss + tr1_loss) / 2 - print(str(local_loss) + ":" + str(dist_loss)) - self.assertAlmostEqual(local_loss, dist_loss, delta=delta) + local_loss = local_losses[step_id] + tr0_loss = tr0_losses[step_id] + tr1_loss = tr1_losses[step_id] + dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2 + print("=======", local_loss, ":", dist_loss[0], "=======") + self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..8b50a312341ddb9bffcaf9f685805f80519f8427 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -0,0 +1,89 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import shutil +import unittest +import tempfile + +import numpy as np + +from test_dist_base import TestDistBase, RUN_STEP + + +class TestDistSaveLoadDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + local_env = {} + local_env["SAVE"] = "1" + local_env["MODEL_DIR"] = model_dir + local_env.update(required_envs) + + cluster_env = {} + cluster_env["LOAD"] = "1" + cluster_env["MODEL_DIR"] = model_dir + cluster_env.update(required_envs) + + local_var = self._run_local(model_file, local_env, check_error_log) + tr0_var, tr1_var = self._run_cluster(model_file, cluster_env, + check_error_log) + + shutil.rmtree(model_dir) + + local_np = np.array(eval(local_var[0])) + train0_np = np.array(eval(tr0_var[0])) + train1_np = np.array(eval(tr1_var[0])) + self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta) + self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta) + self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta) + + def test_dist(self): + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c0989ca709e100d8f147a08970b0e858c81ce09b..c2a4e5ca0c050813785f602c5d2088466e616971 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase): self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=100) + self.check_with_place("dist_se_resnext.py", delta=1e-7) class TestDistseResnXt2x2WithMemopt(TestDistBase): def _setup_config(self): self._sync_mode = True self._mem_opt = True + self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=100) + self.check_with_place("dist_se_resnext.py", delta=1e-7) class TestDistSeResneXt2x2Async(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index c4511a98b0667ecccaa8f63b3064c4fc4e86cc78..986fdd9ff27fe2be54ce97f330028b4ae2358714 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest): trainer, _ = self.get_trainer() +class TestFtrl(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + opt = fluid.optimizer.Ftrl(learning_rate=0.1) + opt.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer, _ = self.get_trainer() + + class TestLRDecayConditional(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') @@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest): ["sum", "scale", "scale", "elementwise_add", "momentum"]) +class TestEmptyPserverOptimizeBlocks(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + # only one parameter + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=False) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0) + sgd_optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + config = fluid.DistributeTranspilerConfig() + config.slice_var_up = False + + pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config) + + self.assertEqual(len(pserver.blocks), 2) + self.assertEqual(len(pserver.blocks[1].ops), 0) + + class TestDistLookupTableBase(TranspilerTest): def network_with_table(self, is_sparse, is_distributed): self.table_size = 1000 self.emb_size = 64 self.lookup_table_name = 'shared_w' - def emb_pool(ids): + def emb_pool(ids, table_name, is_distributed): emb = fluid.layers.embedding( input=ids, size=[self.table_size, self.emb_size], dtype='float32', - param_attr=self.lookup_table_name, # share parameter + param_attr=table_name, is_sparse=is_sparse, is_distributed=is_distributed) pool = fluid.layers.sequence_pool(input=emb, pool_type='average') @@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest): name='title_ids', shape=[1], dtype='int64', lod_level=1) brand_ids = fluid.layers.data( name='brand_ids', shape=[1], dtype='int64', lod_level=1) - title_emb = emb_pool(title_ids) - brand_emb = emb_pool(brand_ids) - fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1) + profile_ids = fluid.layers.data( + name='brand_ids', shape=[1], dtype='int64', lod_level=1) + title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed) + brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed) + profile_emb = emb_pool(profile_ids, "profile_emb", False) + fc0 = fluid.layers.concat( + input=[title_emb, brand_emb, profile_emb], axis=1) predict = fluid.layers.fc(input=fc0, size=2, act=None, @@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): def transpiler_test_impl(self): pserver1, startup1 = self.get_pserver(self.pserver1_ep) - self.assertEqual(len(pserver1.blocks), 3) + self.assertEqual(len(pserver1.blocks), 4) # 0 listen_and_serv # 1 optimize for fc_w or fc_b adam self.assertEqual([op.type for op in pserver1.blocks[1].ops], @@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sum", "scale", "adam", "scale", "scale"]) + # 3 optimize for table 2 adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["sum", "scale", "adam", "scale", "scale"]) + trainer, _ = self.get_trainer() self.assertEqual(len(trainer.blocks), 1) ops = [ 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', - 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad', - 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', - 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', - 'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', + 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'split_selected_rows', 'send', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', + 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -480,39 +535,45 @@ class TestDistLookupTable(TestDistLookupTableBase): def transpiler_test_impl(self): pserver1, startup1 = self.get_pserver(self.pserver1_ep) - self.assertEqual(len(pserver1.blocks), 5) + self.assertEqual(len(pserver1.blocks), 6) # 0 listen_and_serv # 1 optimize for fc_w or fc_b adam self.assertEqual([op.type for op in pserver1.blocks[1].ops], ["sum", "scale", "adam", "scale", "scale"]) - # 2 optimize for table sgd + # 4 prefetch -> lookup_sparse_table for data0 self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sum", "sgd"]) # 3 prefetch -> lookup_sparse_table for data0 - self.assertEqual([op.type for op in pserver1.blocks[3].ops], + self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["lookup_sparse_table"]) - # 4 save table - self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) trainer, trainer_startup = self.get_trainer() self.assertEqual(len(trainer.blocks), 1) ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', - 'sequence_pool', 'concat', 'mul', 'elementwise_add', - 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', - 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', - 'send', 'send_barrier', 'recv', 'recv', 'fetch_barrier' + 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'split_selected_rows', 'send', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', + 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) - startup_ops = [ 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', - 'fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', - 'fetch_barrier', 'fake_init' + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'uniform_random', + 'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', + 'fake_init' ] self.assertEqual([op.type for op in trainer_startup.blocks[0].ops], startup_ops) @@ -526,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): config = fluid.DistributeTranspilerConfig() pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) - self.assertEqual(len(pserver1.blocks), 3) + self.assertEqual(len(pserver1.blocks), 4) # 0 listen_and_serv # 1 optimize for fc_w or fc_b adam self.assertEqual([op.type for op in pserver1.blocks[1].ops], @@ -535,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["adam", "scale", "scale"]) + # 3 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["adam", "scale", "scale"]) trainer, _ = self.get_trainer(config) self.assertEqual(len(trainer.blocks), 1) ops = [ 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', - 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad', - 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', - 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv', - 'recv', 'recv', 'concat' + 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', + 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'split_selected_rows', 'send', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', + 'recv', 'concat', 'concat' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -559,29 +626,34 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) - self.assertEqual(len(pserver1.blocks), 5) + self.assertEqual(len(pserver1.blocks), 6) # 0 listen_and_serv # 1 optimize for fc_w or fc_b adam self.assertEqual([op.type for op in pserver1.blocks[1].ops], ["adam", "scale", "scale"]) - # 2 optimize for table sgd - self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"]) - # 3 prefetch -> lookup_sparse_table for data0 - self.assertEqual([op.type for op in pserver1.blocks[3].ops], + # 2 optimize for table adam + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["adam", "scale", "scale"]) + # 3 optimize for table sgd + self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"]) + # 4 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["lookup_sparse_table"]) - # 4 save table - self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) trainer, _ = self.get_trainer(config) self.assertEqual(len(trainer.blocks), 1) ops = [ 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', - 'sequence_pool', 'concat', 'mul', 'elementwise_add', - 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', - 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', - 'send', 'recv', 'recv' + 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'split_selected_rows', 'send', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', + 'recv', 'concat' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c0c174f1dbcb3a298d270ab5c8c21b370baedd61..f85beee9be528431c1b30424bf6bbd2fb0d26b4c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -873,6 +873,22 @@ class TestBook(unittest.TestCase): out = layers.grid_sampler(x, grid) self.assertIsNotNone(out) print(str(program)) + + def test_affine_grid(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[2, 3, 3], dtype="float32") + out, ids = layers.argsort(input=data, axis=1) + + theta = layers.data(name="theta", shape=[2, 3], dtype="float32") + out_shape = layers.data( + name="out_shape", shape=[-1], dtype="float32") + data_0 = layers.affine_grid(theta, out_shape) + data_1 = layers.affine_grid(theta, [5, 3, 28, 28]) + + self.assertIsNotNone(data_0) + self.assertIsNotNone(data_1) + print(str(program)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 48b52a5412eb99fbc7a5c8534a766ede4954e849..a0358f8b401e301312b5b9c0b18733d4275045e3 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): exe.run(pserver_prog) +def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers, + trainer_id): + x = fluid.layers.data(name='x', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + # loss function + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + # optimizer + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + ps1 = ip + ":" + str(int(port) + 1) + ps2 = ip + ":" + port + pserver_endpoints = ps1 + "," + ps2 + + config = fluid.DistributeTranspilerConfig() + config.slice_var_up = False + t = fluid.DistributeTranspiler(config=config) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=sync_mode) + pserver_prog = t.get_pserver_program(ps2) + + # pserver2 have no parameter + assert (len(pserver_prog.blocks) == 2) + assert (len(pserver_prog.blocks[1].ops) == 0) + + pserver_startup = t.get_startup_program(ps2, pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + + class TestListenAndServOp(OpTest): def setUp(self): self.ps_timeout = 5 @@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest): self.trainers = 1 self.trainer_id = 0 - def _start_pserver(self, use_cuda, sync_mode): + def _start_pserver(self, use_cuda, sync_mode, pserver_func): p = Process( - target=run_pserver, + target=pserver_func, args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, self.trainer_id)) p.daemon = True @@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest): def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode - p1 = self._start_pserver(False, True) + p1 = self._start_pserver(False, True, run_pserver) + self._wait_ps_ready(p1.pid) + + # raise SIGTERM to pserver + os.kill(p1.pid, signal.SIGINT) + p1.join() + + # run pserver on CPU in async mode + p2 = self._start_pserver(False, False, run_pserver) + self._wait_ps_ready(p2.pid) + + # raise SIGTERM to pserver + os.kill(p2.pid, signal.SIGTERM) + p2.join() + + def test_list_and_serv_run_empty_optimize_block(self): + # run pserver on CPU in sync mode + p1 = self._start_pserver(False, True, run_pserver_with_empty_block) self._wait_ps_ready(p1.pid) # raise SIGTERM to pserver @@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest): p1.join() # run pserver on CPU in async mode - p2 = self._start_pserver(False, False) + p2 = self._start_pserver(False, False, run_pserver_with_empty_block) self._wait_ps_ready(p2.pid) # raise SIGTERM to pserver diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 641eb03a5fbf1bb140b20cc3518cea83386fa577..a80ad5b079891efe1b0e1222b3c2455d4891d5f5 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) +class TestSeqMaxPool2DInference(TestSeqMaxPool2D): + def compute(self, x, offset, out): + self.attrs = {'pooltype': "MAX", 'is_test': True} + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 11)) + out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) + + def test_check_grad(self): + """Grad computation does not apply to Sequence MAX + Pool executed when is_test is true """ + return + + class TestSeqLastPool2D(TestSeqAvgPool2D): def compute(self, x, offset, out): self.attrs = {'pooltype': "LAST"} diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 8daac0f43b41b9497812a07fa2f96bffb727413d..9066fc9d1bf13176862f6debf0ed0bedaaaf3eba 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -35,6 +35,7 @@ import sys import numpy as np import collections import six +import logging from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework @@ -767,6 +768,15 @@ in a single call.") prefetch_var_name_to_block_id.extend( lookup_table_var_name_to_block_id) + if len(optimize_blocks) == 0: + logging.warn("pserver [" + str(endpoint) + + "] has no optimize block!!") + pre_block_idx = pserver_program.num_blocks - 1 + empty_block = pserver_program._create_block(pre_block_idx) + optimize_blocks.append(empty_block) + + # In some case, some parameter server will have no parameter to optimize + # So we give an empty optimize block to parameter server. attrs = { "optimize_blocks": optimize_blocks, "endpoint": endpoint, @@ -910,11 +920,11 @@ to transpile() call.") block_idx = int(block_name.split(block_suffix)[1]) orig_var = self.origin_program.global_block().vars[orig_var_name] - skip_numel = 0 + skip_dim0 = 0 slice_vars = self.param_var_mapping[orig_var_name] for slice_var in slice_vars[:block_idx]: - skip_numel += reduce(lambda x, y: x * y, slice_var.shape) - slice_vars_and_attrs.append([orig_var, skip_numel, param]) + skip_dim0 += slice_var.shape[0] + slice_vars_and_attrs.append([orig_var, skip_dim0, param]) return slice_vars_and_attrs @@ -1065,7 +1075,12 @@ to transpile() call.") continue_search_lookup_table_op = False all_ops = program.global_block().ops for op in all_ops: - if op.type == LOOKUP_TABLE_TYPE: + if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input( + "W")[0]: + if not op.attr('is_distributed'): + raise RuntimeError( + "lookup_table_op that lookup an distributed embedding table" + "should set is_distributed to true") continue_search_lookup_table_op = True lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list( @@ -1275,7 +1290,6 @@ to transpile() call.") } outputs = {"ParamOut": [param_var]} # only support sgd now - import logging logging.warn( "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of " + table_opt_op.type) @@ -1442,6 +1456,9 @@ to transpile() call.") elif op_type == "decayed_adagrad": if varkey == "Moment": return param_shape + elif op_type == "ftrl": + if varkey in ["SquaredAccumulator", "LinearAccumulator"]: + return param_shape elif op_type == "sgd": pass else: diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 5269bd94cec47a5262e2389c5b02f91edd5a7d17..9a13cecc646e8534a157fad882fd97836348deb4 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -61,6 +61,9 @@ class InferenceTranspiler(object): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + if use_mkldnn: + self._depthwise_conv_mkldnn(program) + self._fuse_batch_norm(program, place, scope) if use_mkldnn: self._fuse_conv_bias_mkldnn(program) @@ -70,6 +73,31 @@ class InferenceTranspiler(object): program) # ResNet residual block merging self._fuse_bn_relu_mkldnn(program) + def _depthwise_conv_mkldnn(self, program): + ''' + Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program. + The result is: + - before: + - any_other_op->depthwise_conv->any_other_op + - after: + - any_other_op->conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type == 'depthwise_conv2d': + current_op.desc.set_type("conv2d") + i = i + 1 + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + def _fuse_conv_eltwise_mkldnn(self, program): ''' Transpile the program fusing elementwise_add into conv for MKLDNN