diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 6c7d2c10363d3e311dfae455f3dd8fcfc51077a0..05b5f3977cbed2f08df73c6d8ba2fff687db3313 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -80,6 +80,8 @@ parser.add_argument( type=str, default="", help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--profile", action='store_true', help="If set, profile a few steps.") # Flags for defining the tf.train.Server parser.add_argument( @@ -183,8 +185,8 @@ def main(): start_time = time.time() num_samples = 0 train_pass_acc.reset() - for batch_id, data in enumerate(train_reader()): - ts = time.time() + + def run_step(batch_id, data): img_data = np.array( map(lambda x: x[0].reshape(data_shape), data)).astype( "float32") @@ -196,14 +198,28 @@ def main(): feed={"pixel": img_data, "label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size]) + return loss, acc, b_size + + if args.profile and args.task_index == 0: + # warmup. + for batch_id, data in enumerate(train_reader()): + if batch_id > 5: break + run_step(batch_id, data) + with profiler.profiler('All', 'total', '/tmp/profile_vgg'): + for batch_id, data in enumerate(train_reader()): + if batch_id > 5: break + run_step(batch_id, data) + + for batch_id, data in enumerate(train_reader()): + ts = time.time() + loss, acc, b_size = run_step(batch_id, data) iters += 1 num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size) print( - "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " - "Speed = %.2f img/s " % (args.task_index, pass_id, iters, - loss, acc, - len(data) / (time.time() - ts)) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " + "Speed = %.2f img/s" % (pass_id, iters, loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md index 74066a3c2bf7b5e43b18c271de3e18271ee80925..c09b7c99159ace9b3df989f803ede20bc3585d92 100644 --- a/doc/fluid/design/dist_train/distributed_traing_review.md +++ b/doc/fluid/design/dist_train/distributed_traing_review.md @@ -42,7 +42,3 @@ Codistillation is a technique that tries to scale the training further. A few tr [3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation. 
[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION - - - - diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index 9f753478d8ecf12441d4b1745a9f6750a1038e31..1b6f656a006489485a55b5c13b5e2de93c3da0ed 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) { } void BlockDesc::RemoveOp(size_t s, size_t e) { - if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) { return; } need_update_ = true; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 96c181f983a33961e3d5fb8745740f2fdbb210de..9de44beafbb69b3510b97afcc43d4b489a029c35 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -15,12 +15,14 @@ if(WITH_GPU) dynload_cuda) set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) + nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) + else() set(multi_devices_graph_builder_deps) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) + cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() -cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 33e02ab65a251a338225ee621ff14acbb0631992..2afa47c81bead6fb104f49886713bf75dc1b4dc0 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -19,14 +19,12 @@ namespace paddle { namespace framework { namespace details { -BroadcastOpHandle::BroadcastOpHandle(const std::vector &local_scopes, - const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} void BroadcastOpHandle::RunImpl() { - // the input and output may have dummy var. - VarHandle *in_var_handle; + if (places_.size() == 1) return; + // The input and output may have dummy vars. + VarHandle *in_var_handle; { auto in_var_handles = DynamicCast(inputs_); PADDLE_ENFORCE_EQ(in_var_handles.size(), 1, @@ -55,27 +53,97 @@ void BroadcastOpHandle::RunImpl() { Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); - for (auto *out : out_var_handles) { - if (*out == *in_var_handle) { + // NOTE: The tensors' Place of input and output must be all on GPU or all on + // CPU. 
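+  // The loop below only sizes the outputs (share the input's dims/LoD and
+  // allocate its dtype on each place); the data copies follow on the CPU or
+  // NCCL path.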
+ for (auto *out_var_handle : out_var_handles) { + if (out_var_handle->IsTheSameVar(*in_var_handle)) { continue; } - - auto &out_p = out->place_; - auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_); + auto t_out_p = out_var_handle->place_; + auto *out_var = var_scopes.at(out_var_handle->scope_idx_) + ->FindVar(out_var_handle->name_); PADDLE_ENFORCE_NOT_NULL(out_var); - PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(), - "Places must be all on CPU or all on CUDA."); - + if (platform::is_gpu_place(in_tensor.place())) { + PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), + "Places of input and output must be all on GPU."); + } else { + t_out_p = platform::CPUPlace(); + } VariableVisitor::ShareDimsAndLoD(*in_var, out_var); - VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p, + VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p, in_tensor.type()); + } + + if (platform::is_cpu_place(in_tensor.place())) { + for (auto *out_var_handle : out_var_handles) { + if (out_var_handle->IsTheSameVar(*in_var_handle)) { + continue; + } + auto &out_p = out_var_handle->place_; + auto *out_var = var_scopes.at(out_var_handle->scope_idx_) + ->FindVar(out_var_handle->name_); + + RunAndRecordEvent(out_p, [in_tensor, out_var] { + paddle::framework::TensorCopy( + in_tensor, platform::CPUPlace(), + &VariableVisitor::GetMutableTensor(out_var)); + }); + } + } else { +#ifdef PADDLE_WITH_CUDA + VarHandle *out_handle = nullptr; + int root_id = boost::get(in_tensor.place()).device; + std::vector> broadcast_calls; + + for (auto out_var_handle : out_var_handles) { + Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) + ->FindVar(out_var_handle->name_); + + int dst_id = + boost::get(out_var_handle->place_).device; + + auto &nccl_ctx = nccl_ctxs_->at(dst_id); + + void *send_recv_buffer = nullptr; + if (root_id == dst_id) { + send_recv_buffer = const_cast(in_tensor.data()); + out_handle = out_var_handle; + } else { + send_recv_buffer = + VariableVisitor::GetMutableTensor(out_var).mutable_data( + out_var_handle->place_); + } + + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + broadcast_calls.emplace_back( + [send_recv_buffer, numel, type, root_id, &nccl_ctx] { + PADDLE_ENFORCE(platform::dynload::ncclBcast( + send_recv_buffer, numel, static_cast(type), + root_id, nccl_ctx.comm_, nccl_ctx.stream())); + }); + } - auto dev_ctx = dev_ctxes_.at(out_p); - RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] { - paddle::framework::TensorCopy( - in_tensor, out_p, *(dev_ctx), - &VariableVisitor::GetMutableTensor(out_var)); + this->RunAndRecordEvent([&] { + { + platform::NCCLGroupGuard guard; + for (auto &call : broadcast_calls) { + call(); + } + } + + if (!out_handle->IsTheSameVar(*in_var_handle)) { + auto out_var = var_scopes.at(in_var_handle->scope_idx_) + ->FindVar(out_var_handles[0]->name_); + paddle::framework::TensorCopy( + in_tensor, in_var_handle->place_, + *(dev_ctxes_.at(in_var_handle->place_)), + &VariableVisitor::GetMutableTensor(out_var)); + } }); +#else + PADDLE_THROW("CUDA is not enabled."); +#endif } } diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 92420f10ac5972b7924d83b43bb28234079e5072..984a95008c0393eff01c2d419cc98949aed14980 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,14 +24,32 @@ #include 
"paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + namespace paddle { namespace framework { namespace details { struct BroadcastOpHandle : public OpHandleBase { public: +#ifdef PADDLE_WITH_CUDA + BroadcastOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + } + } + } +#else BroadcastOpHandle(const std::vector &local_scopes, - const std::vector &places); + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif std::string Name() const override; @@ -44,6 +62,9 @@ struct BroadcastOpHandle : public OpHandleBase { private: const std::vector &local_scopes_; const std::vector &places_; +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; +#endif }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 8f1b6d1615312fced0887f9ff14ae17877371b7e..c6e923ef77ff03413eefe4f26457a5322747618e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -35,15 +35,25 @@ struct TestBroadcastOpHandle { std::unique_ptr op_handle_; std::vector> vars_; std::vector gpu_list_; + bool use_gpu_; +#ifdef PADDLE_WITH_CUDA + std::unique_ptr nccl_ctxs_; +#endif void WaitAll() { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif } void InitCtxOnGpu(bool use_gpu) { - if (use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu_) { #ifdef PADDLE_WITH_CUDA int count = p::GetCUDADeviceCount(); if (count <= 1) { @@ -57,6 +67,7 @@ struct TestBroadcastOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CUDADeviceContext(p)); } + nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else PADDLE_THROW("CUDA is not support."); #endif @@ -67,6 +78,9 @@ struct TestBroadcastOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_.reset(nullptr); +#endif } } @@ -82,7 +96,21 @@ struct TestBroadcastOpHandle { } param_scopes_[input_scope_idx]->Var("input"); - op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset( + new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset( + new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); +#else + op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); +#endif + } auto* in_var_handle = new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]); @@ -97,7 +125,9 @@ struct TestBroadcastOpHandle { op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { - op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); + if (!use_gpu_) { + op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); + } VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]); vars_.emplace_back(out_var_handle); 
op_handle_->AddOutput(out_var_handle); diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 3ed7723919fc3a547b15c28b846de758a8155e66..3dfc972a44c62bd2adfc1331f29ffb1cca537652 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -25,6 +25,7 @@ GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, : local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { + if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); @@ -35,7 +36,6 @@ void GatherOpHandle::RunImpl() { VarHandle *out_var_handle; { auto out_var_handles = DynamicCast(outputs_); - PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, "The number of output should be one."); out_var_handle = out_var_handles.front(); @@ -50,68 +50,62 @@ void GatherOpHandle::RunImpl() { auto pre_in_var = var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_); PADDLE_ENFORCE_NOT_NULL(pre_in_var); + PADDLE_ENFORCE(pre_in_var->IsType(), "Currently, gather_op only can gather SelectedRows."); - auto pre_place = in_0_handle->place_; - PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(), - "The place of input and output should be the same."); - // Wait input done, this Wait is asynchronous operation WaitInputVarGenerated(in_var_handles); + auto &pre_in_value = pre_in_var->Get(); std::vector out_rows; std::vector in_tensors; - std::vector in_places; - auto &pre_in = pre_in_var->Get(); - // gather the inputs + // Gather the inputs for (auto *in_handle : in_var_handles) { - auto in_p = in_handle->place_; - in_places.push_back(in_p); - PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), - "Places must be all on CPU or all on CUDA."); auto *in_var = var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); - auto &in_sr = in_var->Get(); + PADDLE_ENFORCE_NOT_NULL(in_var); + VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); - PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(), - "The type of input is not consistent."); - PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(), - "The height of inputs is not consistent."); - PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), - "The dims of inputs is not consistent."); + auto &in_sr_value = in_var->Get(); - auto &in_sr_rows = in_sr.rows(); + auto &in_sr_rows = in_sr_value.rows(); out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); - - in_tensors.emplace_back(in_sr.value()); + in_tensors.emplace_back(in_sr_value.value()); } - // write the output - auto &out_place = out_var_handle->place_; - auto out_scope_idx = out_var_handle->scope_idx_; - auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_); + // NOTE: The Places of all input tensor must be all on CPU or all on GPU. 
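+  // When the inputs sit on CPU, gather to CPUPlace even if the output handle
+  // names another place.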
+ platform::Place t_out_p = out_var_handle->place_; + if (platform::is_gpu_place(pre_in_value.place())) { + PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), + "Places of input and output must be all on GPU."); + } else { + t_out_p = platform::CPUPlace(); + } - auto out = out_var->GetMutable(); - out->set_height(pre_in.height()); - out->set_rows(out_rows); + auto out_var = + var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_value = out_var->GetMutable(); + out_value->set_height(pre_in_value.height()); + out_value->set_rows(out_rows); size_t rows = out_rows.size(); - DDim out_dim = pre_in.GetCompleteDims(); + DDim out_dim = pre_in_value.GetCompleteDims(); out_dim[0] = static_cast(rows); - out->mutable_value()->Resize(out_dim); - out->mutable_value()->mutable_data(out_place, pre_in.value().type()); - Tensor *out_tensor = out->mutable_value(); + out_value->mutable_value()->Resize(out_dim).mutable_data( + t_out_p, pre_in_value.value().type()); + Tensor *out_tensor = out_value->mutable_value(); // copy - auto dev_ctx = dev_ctxes_[out_place]; - RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] { + auto dev_ctx = dev_ctxes_[out_var_handle->place_]; + RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx, + t_out_p] { int s = 0, e = 0; for (size_t j = 0; j < in_tensors.size(); ++j) { e += in_tensors[j].dims()[0]; auto sub_out = out_tensor->Slice(s, e); - paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx), - &sub_out); + paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out); s = e; } }); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index daba9bf2dd861d6366723ed71eada66bc67c78f2..21197d587b772aa046d6b3ce4394d3057ed6bf35 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -11,9 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include +#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/send_op_handle.h" #include "paddle/fluid/framework/scope.h" @@ -34,8 +36,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool use_default_grad_scale, - platform::NCCLContextMap *nccl_ctxs) + const std::vector &local_scopes, + platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes), @@ -105,6 +107,11 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { + std::unordered_map var_types; + for (auto *var : program.Block(0).AllVars()) { + var_types[var->Name()] = var->GetType(); + } + auto graph = new SSAGraph(); SSAGraph &result = *graph; std::unordered_set og_has_been_broadcast; @@ -133,12 +140,17 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( is_forwarding = false; } else { CreateComputationalOps(&result, *op, places_.size()); - if (!is_forwarding) { + if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. for (auto &og : op->OutputArgumentNames()) { if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { - InsertNCCLAllReduceOp(&result, og); + if (IsSparseGradient(var_types, og)) { + CreateReduceOp(&result, og, 0); + CreateBroadcastOp(&result, og, 0); + } else { + InsertNCCLAllReduceOp(&result, og); + } } } } @@ -165,6 +177,50 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( return std::unique_ptr(graph); } +bool MultiDevSSAGraphBuilder::IsSparseGradient( + const std::unordered_map &var_types, + const std::string &og) const { + PADDLE_ENFORCE(var_types.count(og) != 0); + if (var_types.at(og) == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + +void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, + const std::string &p_name, + size_t src_dev_id) const { +#ifdef PADDLE_WITH_CUDA + auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_); +#else + auto *op_handle = new BroadcastOpHandle(local_scopes_, places_); +#endif + + result->ops_.emplace_back(op_handle); + auto *in = result->vars_.at(src_dev_id).at(p_name).back().get(); + op_handle->AddInput(in); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &vars = result->vars_.at(i).at(p_name); + auto &p = places_[i]; + auto *out_var = new VarHandle(vars.size(), i, p_name, p); + vars.emplace_back(out_var); + op_handle->AddOutput(out_var); +#ifndef ADDLE_WITH_CUDA + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif + } +} + +void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, + const OpDesc &op, + int dev_id) const { + result->ops_.emplace_back( + new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id])); + CreateOpHandleIOs(result, op, dev_id); +} + OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc( const ProgramDesc &program) const { for (auto *op : program.Block(0).AllOps()) { @@ -174,7 +230,6 @@ OpDesc 
*MultiDevSSAGraphBuilder::GetSendOpDesc( } return nullptr; } - void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( SSAGraph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA @@ -247,6 +302,36 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, } } +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, + const std::string &og, + int dst_dev_id) const { +#ifdef PADDLE_WITH_CUDA + result->ops_.emplace_back( + new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_)); +#endif + auto *op_handle = result->ops_.back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &vars = result->vars_[i][og]; +#ifndef PADDLE_WITH_CUDA + auto &p = places_[i]; + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad.get()); + } + auto &vars = result->vars_[dst_dev_id][og]; + auto var = + new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]); + vars.emplace_back(var); + op_handle->AddOutput(var); + return var; +} + void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result, const OpDesc &op) const { auto &p = places_[0]; @@ -263,6 +348,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { return op.OutputArgumentNames().size() == 1 && op.OutputArgumentNames()[0] == GradVarName(loss_var_name_); } + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index bad47458ef4cd1cd42e902341e8be66da5c210ed..674e2779a112c26d05b84cd54df2c826e9a63373 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once - #include +#include #include #include "paddle/fluid/framework/details/ssa_graph_builder.h" @@ -27,6 +27,7 @@ class NCCLContextMap; namespace framework { class Scope; namespace details { + class MultiDevSSAGraphBuilder : public SSAGraphBuilder { public: #ifdef PADDLE_WITH_CUDA @@ -34,8 +35,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - bool skip_scale_loss, - platform::NCCLContextMap *nccl_ctxs); + platform::NCCLContextMap *nccl_ctxs, + bool use_default_grad_scale); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, @@ -74,6 +75,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { size_t num_places) const; void CreateScaleLossGradOp(SSAGraph *result) const; + VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og, + int dst_dev_id) const; + void CreateComputationalOp(SSAGraph *result, const OpDesc &op, + int dev_id) const; bool IsParameterGradientOnce( const std::string &og, @@ -81,11 +86,18 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; + void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, + size_t src_dev_id) const; + /** * Get send op in the global block of program. * nullptr if not found. 
*/ OpDesc *GetSendOpDesc(const ProgramDesc &program) const; + + bool IsSparseGradient( + const std::unordered_map &var_types, + const std::string &og) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 409e8f72b841de03dcb50e62de447ae9895df2c0..1bb04c1dfca107f4b7ce4c599e9aa132de3e5985 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -22,6 +22,7 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { + if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); @@ -51,44 +52,48 @@ void ReduceOpHandle::RunImpl() { // Wait input done, this Wait is asynchronous operation WaitInputVarGenerated(in_var_handles); - auto pre_place = in_0_handle->place_; - std::vector in_places; - auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var); - for (auto *in_handle : in_var_handles) { - auto in_p = in_handle->place_; - PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(), - "Places must be all on CPU or all on CUDA."); - in_places.emplace_back(in_p); + // NOTE: The Places of all input tensor must be all on CPU or all on GPU. + std::vector in_places; // used to get dev_ctx + for (auto *in_handle : in_var_handles) { + in_places.emplace_back(in_handle->place_); auto in_var = var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_); PADDLE_ENFORCE_NOT_NULL(in_var); - - auto in_tensor = VariableVisitor::GetMutableTensor(in_var); - PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(), - "The type of input is not consistent."); + VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); } auto out_var = var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_); PADDLE_ENFORCE_NOT_NULL(out_var); + // NOTE: The tensors' Place of input and output must be all on GPU or all on + // CPU. 
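+  // Reduce to the output handle's place for GPU inputs; CPU inputs fall back
+  // to CPUPlace.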
+ auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); + platform::Place t_out_p; + if (platform::is_gpu_place(in_p)) { + PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_), + "Places of input and output must be all on GPU."); + t_out_p = out_var_handle->place_; + } else { + t_out_p = platform::CPUPlace(); + } + if (pre_in_var->IsType()) { std::vector in_selected_rows = GetInputValues(in_var_handles, var_scopes); - GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, - out_var_handle->place_, + GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p, out_var->GetMutable()); } else { std::vector lod_tensors = GetInputValues(in_var_handles, var_scopes); - if (paddle::platform::is_cpu_place(pre_place)) { + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { ReduceLoDTensor func(lod_tensors, out_var->GetMutable()); VisitDataType(ToDataType(lod_tensors[0]->type()), func); - } else if (paddle::platform::is_gpu_place(pre_place)) { + } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { #ifdef PADDLE_WITH_CUDA auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); @@ -96,7 +101,7 @@ void ReduceOpHandle::RunImpl() { out_var_handle->place_, pre_in.type()); auto out_p = out_var_handle->place_; - int root = boost::get(out_p).device; + int root_id = boost::get(out_p).device; std::vector> all_reduce_calls; for (size_t i = 0; i < var_scopes.size(); ++i) { auto &p = in_places[i]; @@ -104,23 +109,23 @@ void ReduceOpHandle::RunImpl() { int dev_id = boost::get(p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; void *buffer = const_cast(lod_tensor.data()); void *recvbuffer = nullptr; - if (root == dev_id) { + if (root_id == dev_id) { recvbuffer = out_var->GetMutable()->mutable_data( out_var_handle->place_); } int type = platform::ToNCCLDataType(lod_tensor.type()); - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclReduce( - buffer, recvbuffer, static_cast(lod_tensor.numel()), - static_cast(type), ncclSum, root, comm, stream)); - }); + size_t numel = static_cast(lod_tensor.numel()); + all_reduce_calls.emplace_back( + [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { + PADDLE_ENFORCE(platform::dynload::ncclReduce( + buffer, recvbuffer, numel, static_cast(type), + ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); + }); } this->RunAndRecordEvent([&] { @@ -130,7 +135,7 @@ void ReduceOpHandle::RunImpl() { } }); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW("CUDA is not enabled."); #endif } else { PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 9746b3bdbde14d24a83a27a593c5f1ebfec201ff..59731d348d17755fbd8bf3b6fa29b32bdefaf71e 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -55,7 +55,7 @@ struct ReduceOpHandle : public OpHandleBase { std::string Name() const override; - bool IsMultiDeviceTransfer() override { return false; }; + bool IsMultiDeviceTransfer() override { return true; }; protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 9f7fd69e64fe9d7ef0bf3037bea7f686cb2eee0b..cae9af7217660fb7e4b8535ee8e022fb3a127668 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ 
b/paddle/fluid/framework/details/var_handle.h @@ -62,7 +62,7 @@ struct VarHandle : public VarHandleBase { std::string name_; platform::Place place_; - bool operator==(const VarHandle& o) const { + bool IsTheSameVar(const VarHandle& o) const { return o.generated_op_ == generated_op_ && o.name_ == name_ && o.scope_idx_ == scope_idx_; } diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 10bac0fae9504215fab11dd8cca7c278feaa4bda..3dfd14419d94379a0bf79f55d7a139acd77cbd7e 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -88,6 +88,52 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) { VisitVariable(src, &visitor); } +struct EnforceShapeAndDTypeEQVisitor { + const Variable* trg_; + + void operator()(const LoDTensor& src) { + auto& tensor = trg_->Get(); + PADDLE_ENFORCE_EQ( + src.place().which(), tensor.place().which(), + "The Places of the two Variable must be all on CPU or all on GPU."); + PADDLE_ENFORCE_EQ(src.type(), tensor.type(), + "The dtype of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(), + "The dims of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(), + "The lod of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(), + "The layout of the two Variable's tensor is not equal."); + } + + void operator()(const SelectedRows& src) { + auto& selected_rows = trg_->Get(); + PADDLE_ENFORCE_EQ( + src.place().which(), selected_rows.place().which(), + "The Places of the two Variable must be all on CPU or all on GPU."); + PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(), + "The dtype of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(), + "The layout of the two Variable's tensor is not equal."); + PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(), + "The height of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(), + "The dims of the two Variable is not equal."); + } + + template + void operator()(const T&) { + PADDLE_THROW("EnforceShapeAndDTypeEQ is not supported by type %s", + typeid(T).name()); + } +}; + +void VariableVisitor::EnforceShapeAndDTypeEQ(const Variable& var1, + const Variable& var2) { + EnforceShapeAndDTypeEQVisitor visitor{&var1}; + VisitVariable(var2, &visitor); + } + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h index 67baa1895e4513738fa73d49c46660da92279b9d..ca9a19bdcf1be7bf0e1d2b0de560a38f528a2d2c 100644 --- a/paddle/fluid/framework/details/variable_visitor.h +++ b/paddle/fluid/framework/details/variable_visitor.h @@ -26,6 +26,9 @@ class VariableVisitor { static Tensor &GetMutableTensor(Variable *var); static void ShareDimsAndLoD(const Variable &src, Variable *trg); + + static void EnforceShapeAndDTypeEQ(const Variable &var1, + const Variable &var2); }; } // namespace details diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 766bf0ab0c1c50146ad3f6e048738209428707b9..ce91d7a82674364560b8065277b28b51ae1b303a 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -348,8 +348,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } } + 
platform::DeviceContextPool::Instance().Get(place_)->Wait(); if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); + } else { + // Delete the local scopes created in operators. + scope->DropKids(); } if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4712efeff68cf98a50b610acf78bb52d7519c520..9eea8d1c1861b8a7f6e49621b27c9871b0c1a590 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -93,7 +93,7 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA details::MultiDevSSAGraphBuilder builder( member_->places_, loss_var_name, params, member_->local_scopes_, - use_default_grad_scale, member_->nccl_ctxs_.get()); + member_->nccl_ctxs_.get(), use_default_grad_scale); #else details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 288789d6e484100820c937e6081701f1e9245706..c8b656394b403c4965e01e96c9215d9406091907 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda) +nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/io_converter.cc b/paddle/fluid/inference/tensorrt/io_converter.cc new file mode 100644 index 0000000000000000000000000000000000000000..2baac96c26453af7e70e541d80b437df3d5c2657 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/io_converter.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/io_converter.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using platform::is_gpu_place; +using platform::is_cpu_place; + +class DefaultInputConverter : public EngineInputConverter { + public: + DefaultInputConverter() {} + // NOTE out is GPU memory. 
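+  // A CPU input is copied host-to-device and a GPU input device-to-device;
+  // both run asynchronously on the caller's stream, then synchronize.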
+ virtual void operator()(const LoDTensor& in, void* out, + size_t max_size) override { + PADDLE_ENFORCE(out != nullptr); + PADDLE_ENFORCE_LE(in.memory_size(), max_size); + const auto& place = in.place(); + if (is_cpu_place(place)) { + PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out, in.data(), in.memory_size(), + cudaMemcpyHostToDevice, *stream_)); + + } else if (is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out, in.data(), in.memory_size(), + cudaMemcpyDeviceToDevice, *stream_)); + + } else { + PADDLE_THROW("Unknown device for converter"); + } + cudaStreamSynchronize(*stream_); + } +}; + +REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/io_converter.h b/paddle/fluid/inference/tensorrt/io_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..6ea61cbbac05f106f736b7d6a13912157c5ef48c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/io_converter.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using framework::LoDTensor; + +/* + * Convert Input from Fluid to an Engine. + * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in + * most cases just need to copy the data. + */ +class EngineInputConverter { + public: + EngineInputConverter() {} + + virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {} + + void SetStream(cudaStream_t* stream) { stream_ = stream; } + + static void Run(const std::string& in_op_type, const LoDTensor& in, void* out, + size_t max_size, cudaStream_t* stream) { + PADDLE_ENFORCE(stream != nullptr); + auto* converter = Registry::Lookup(in_op_type); + PADDLE_ENFORCE_NOT_NULL(converter); + converter->SetStream(stream); + (*converter)(in, out, max_size); + } + + virtual ~EngineInputConverter() {} + + protected: + cudaStream_t* stream_{nullptr}; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \ + struct trt_input_##in_op_type__##_converter { \ + trt_input_##in_op_type__##_converter() { \ + ::paddle::inference::Registry::Register< \ + Converter__>(#in_op_type__); \ + } \ + }; \ + trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__; diff --git a/paddle/fluid/inference/tensorrt/test_io_converter.cc b/paddle/fluid/inference/tensorrt/test_io_converter.cc new file mode 100644 index 0000000000000000000000000000000000000000..365e9366862bee25c70dba0cdd92f318ab3ee90f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_io_converter.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/tensorrt/io_converter.h" + +#include + +namespace paddle { +namespace inference { +namespace tensorrt { + +class EngineInputConverterTester : public ::testing::Test { + public: + void SetUp() override { tensor.Resize({10, 10}); } + + framework::LoDTensor tensor; +}; + +TEST_F(EngineInputConverterTester, DefaultCPU) { + void* buffer; + tensor.mutable_data(platform::CPUPlace()); + ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); + + cudaStream_t stream; + cudaStreamCreate(&stream); + EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(), + &stream); +} + +TEST_F(EngineInputConverterTester, DefaultGPU) { + void* buffer; + tensor.mutable_data(platform::CUDAPlace()); + ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); + + cudaStream_t stream; + cudaStreamCreate(&stream); + EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(), + &stream); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 97d9f03f88ad3e851a2dd4256d34e8ca76fdfb01..cc179a86256e6b552c08a091402157bdcc86b383 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -24,6 +24,11 @@ function(inference_test TARGET_NAME) endforeach() endfunction(inference_test) +#################### +# Inference tests here depend on fluid/tests/book. If users want to run +# individual test with ctest, they need to run tests in fluid/tests/book +# first to generate saved model. +#################### # This unittest is buggy! #inference_test(fit_a_line) inference_test(image_classification ARGS vgg resnet) diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h new file mode 100644 index 0000000000000000000000000000000000000000..f05921067c45f156319375b613f51101cfda8e90 --- /dev/null +++ b/paddle/fluid/inference/utils/singleton.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +// NOTE not thread-safe. +template +struct Singleton { + static T& Global() { + static T* x = new T; + return *x; + } + + Singleton() = delete; + Singleton& operator=(const Singleton&) = delete; +}; + +/* + * A registry for any type. + * NOTE not thread-safe.
+ */ +template +struct Registry { + static Registry& Global() { + static auto* x = new Registry; + return *x; + } + + template + static void Register(const std::string& name) { + PADDLE_ENFORCE_EQ(items_.count(name), 0); + items_[name] = new ItemChild; + } + + static ItemParent* Lookup(const std::string& name) { + auto it = items_.find(name); + if (it == items_.end()) return nullptr; + return it->second; + } + + ~Registry() { + for (auto& item : items_) { + delete item.second; + } + } + + private: + Registry() = default; + static std::unordered_map items_; +}; + +template +std::unordered_map Registry::items_; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc index c9ed221a6e662e8c213fe1d34ff85a3f77483a3c..7bdb3f274aa9bacb6b261e0d0cd00b72f1d409ae 100644 --- a/paddle/fluid/operators/adadelta_op.cc +++ b/paddle/fluid/operators/adadelta_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -55,6 +56,12 @@ class AdadeltaOp : public framework::OperatorWithKernel { ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc index 0153e1253b00ded21a7a14e37faf5a76d904d8d1..1227129429addb0ed412c7f1755fd39c9ca77157 100644 --- a/paddle/fluid/operators/adagrad_op.cc +++ b/paddle/fluid/operators/adagrad_op.cc @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -56,6 +57,12 @@ class AdagradOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 267dcab8104c337c8590180c8093098c756ab27d..f12f0c6663d1785b8af852244ffe32358fb1b693 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class AdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -69,6 +70,12 @@ class AdamOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class AdamOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc index 7e2f1cc66ebf8b7deebf55057a27129844129d5d..608b855d58a2f701fbb8631cb5f24768a61f3deb 100644 --- a/paddle/fluid/operators/adamax_op.cc +++ b/paddle/fluid/operators/adamax_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -63,6 +64,12 @@ class AdamaxOp : public framework::OperatorWithKernel { ctx->SetOutputDim("MomentOut", param_dims); ctx->SetOutputDim("InfNormOut", param_dims); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index cf410c3ca16955620610634b99ee9111106ef99f..7a7b8b76e43b1f91a3ba2767c217993cc39f26b6 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -366,7 +366,8 @@ REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc index 5eeb3dee095e330174f35fa8eebd418bf764b132..5a1315fb2a80bf7f7f57388d0d6832686442c4ff 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class DecayedAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -51,6 +52,12 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 02bb2b9cebb87b83aa1cbef0c644f969b4d17284..fffa9ae7a43ea5cd7b2bda6fbbf6ef9f7d23009d 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -69,6 +69,10 @@ message VariableMessage { bytes rows = 9; // Look up table block execution output variable name. string out_varname = 10; + // If true, the ps server will start profiling, the ps + // server stops profiling and generates a profile to /tmp/profile_ps_* + // when profile switches from true to false. + bool profile = 11; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 766bcf1ac5e06628638fcc8a305c00ab2795bbf2..d68cf467f7b0c6157dc1f69571e5d0c0b3c70348 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void* payload = nullptr; size_t payload_size; ProtoEncodeHelper e(static_cast(buf), 1024); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. 
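+  // The flag is serialized as VariableMessage field 11 ("profile"); the PS
+  // toggles its own profiler when the received value changes.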
+ if (platform::ShouldSendProfileState()) { + e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled()); + } e.WriteString(VarMsg::kVarnameFieldNumber, name); if (var->IsType()) { e.WriteUint64(VarMsg::kTypeFieldNumber, 0); diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index fbef8d02a4d765052fccf3792ebe0373d46b1ef6..f4a374d56d28a30201f0d482e97e1a40e7a8bf41 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" @@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) { meta_.set_out_varname(temp); break; } - + case sendrecv::VariableMessage::kProfileFieldNumber: { + bool profiling; + if (!input.ReadRaw(reinterpret_cast(&profiling), 1)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if (listener_id <= 0) { + break; + } + if (profiling && !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (!profiling && platform::IsProfileEnabled()) { + // TODO(panyx0718): Should we allow to customize file dir. + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("/tmp/profile_ps_%lld", listener_id)); + } + break; + } default: { // Unknown tag, return unknown error. return -1; diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc index 0a456f0981e5753a7de5a6f2ba029681beb347a5..cbdcce9beb3fafb0775d0b5fc39cb381ad128d0c 100644 --- a/paddle/fluid/operators/ftrl_op.cc +++ b/paddle/fluid/operators/ftrl_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class FTRLOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -53,6 +54,12 @@ class FTRLOp : public framework::OperatorWithKernel { ctx->SetOutputDim("SquaredAccumOut", param_dim); ctx->SetOutputDim("LinearAccumOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 59b94511552874e1557d8a9d7a687af14f96c31c..8acbf820250957163397342c645b333f0da0801c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/listen_and_serv_op.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { + // Mark this as PS that it should decide profiling by listening from trainer. 
+ platform::SetProfileListener(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); framework::Scope &recv_scope = scope.NewScope(); @@ -328,9 +331,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, rpc_service_->WaitServerReady(); // Write to a file of server selected port for python use. - std::string file_path = - string::Sprintf("/tmp/paddle.%d.selected_port", - static_cast(::getpid())); + std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port", + static_cast(::getpid())); SavePort(file_path); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index bd19d8908e35e51872d324ea5aa6bb02110d5a92..d36aa0ce025a1c0f717913131fcc75040d16afac 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -46,8 +46,7 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - framework::TensorCopy(*lod_t, platform::CPUPlace(), - ctx.device_context(), &lod_cpu); + framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index f1839e456d66ab95fb4ccac933cf7b635c54f5a0..66b626ed792ddec9d57fcf6c81655dffcc23ca99 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -62,7 +62,7 @@ class LookupSparseTableOp : public framework::OperatorBase { auto w_t = w_var->GetMutable(); std::vector keys; keys.resize(ids_t.numel()); - for (size_t i = 0; i < ids_t.numel(); ++i) { + for (int64_t i = 0; i < ids_t.numel(); ++i) { keys[i] = ids_t.data()[i]; } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index f36e9444dfb6dce2a7ea9eba153cda174f1ed6f1..53a478c1ac0bdf8c0a3f3721161779ef10cb14f8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -41,7 +41,7 @@ math_library(depthwise_conv) math_library(gru_compute DEPS activation_functions math_function) math_library(im2col) math_library(lstm_compute DEPS activation_functions) -cc_library(blas SRCS blas.cc DEPS cblas framework_proto) +cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) math_library(pooling) diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index f0847aafae78f17eb28745bd224d45ec86497030..a46f2d51ca64501a622b5b48b424dffa16efc5b4 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -69,8 +69,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } std::vector input; @@ -86,8 +86,8 @@ void testConcat() { int* out_ptr; if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + 
paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -142,8 +142,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -157,8 +157,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -215,8 +215,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -230,8 +230,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -290,8 +290,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -305,8 +305,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h index 19f6b213aa3bc06f7f5750fa42745fd8755c51b9..ccbd05c82ad6a880d21269092088be9656b35c99 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h @@ -59,9 +59,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, - r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, - active_gate, active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, + &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, + active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -125,11 +125,11 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, - r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, - 
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, - r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, - active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, + &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, + &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, + &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, + active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -186,9 +186,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, - r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, - active_gate, active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, + &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, + active_node, active_gate, active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -258,11 +258,11 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, - r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, - r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, - r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, - active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, + &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, + &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, + &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, + active_node, active_gate, active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index d29c780dcfb1f1a3cbab25256238769d3a5ccd93..2aecb69237fdf344ebc0bfe72d9c7c147f06358d 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -70,9 +70,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, r_prev_state = value.prev_state_value[frame_idx]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, - r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, - active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, + &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, + active_node, active_gate, active_state); value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx + frame_size] = r_value_ig; @@ -145,11 +145,11 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[frame_idx]; } - op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, - r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, - r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, - r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, - active_state); + op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig, + &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, + &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, + &r_checkO, &r_checkIGrad, &r_checkFGrad, 
&r_checkOGrad, active_node, + active_gate, active_state); grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx + frame_size] = r_grad_ig; diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h index 9080634f2b3fc122a420e049314f53abd50376e0..cbe73d62938d7c4c03a2c8731665260624417fd7 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once +#include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/platform/hostdevice.h" -#include - namespace paddle { namespace operators { namespace math { @@ -27,19 +27,19 @@ namespace forward { template class lstm { public: - HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, - T &prev_state, T &state, T &state_atv, T &output, - T &checkI, T &checkF, T &checkO, + HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, + T *prev_state, T *state, T *state_atv, T *output, + T *checkI, T *checkF, T *checkO, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { - value_in = activation(value_in, active_node); - value_ig = activation(value_ig + prev_state * checkI, active_gate); - value_fg = activation(value_fg + prev_state * checkF, active_gate); - state = value_in * value_ig + prev_state * value_fg; - value_og = activation(value_og + state * checkO, active_gate); - state_atv = activation(state, active_state); - output = value_og * state_atv; + *value_in = activation(*value_in, active_node); + *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate); + *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate); + *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg); + *value_og = activation(*value_og + (*state) * (*checkO), active_gate); + *state_atv = activation(*state, active_state); + *output = (*value_og) * (*state_atv); } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default @@ -48,27 +48,27 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, - __m256 &value_fg, __m256 &value_og, - __m256 &prev_state, __m256 &state, - __m256 &state_atv, __m256 &output, __m256 &checkI, - __m256 &checkF, __m256 &checkO, + HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig, + __m256 *value_fg, __m256 *value_og, + __m256 *prev_state, __m256 *state, + __m256 *state_atv, __m256 *output, __m256 *checkI, + __m256 *checkF, __m256 *checkO, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { - value_in = activation(value_in, active_node); - value_ig = - activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), - active_gate); - value_fg = - activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), - active_gate); - state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), - _mm256_mul_ps(prev_state, value_fg)); - value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), - active_gate); - state_atv = activation(state, active_state); - output = _mm256_mul_ps(value_og, state_atv); + *value_in = activation(*value_in, active_node); + *value_ig = activation( + _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)), + active_gate); + *value_fg = activation( + _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)), + active_gate); + *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig), + _mm256_mul_ps(*prev_state, *value_fg)); + *value_og = activation( + _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate); + *state_atv = activation(*state, active_state); + *output = _mm256_mul_ps(*value_og, *state_atv); } #endif #endif @@ -81,26 +81,29 @@ namespace backward { template class lstm { public: - HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, - T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, - T &prev_state, T &prev_state_grad, T &state, - T &state_grad, T &state_atv, T &output_grad, - T &checkI, T &checkF, T &checkO, T &checkIGrad, - T &checkFGrad, T &checkOGrad, + HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, + T *grad_in, T *grad_ig, T *grad_fg, T *grad_og, + T *prev_state, T *prev_state_grad, T *state, + T *state_grad, T *state_atv, T *output_grad, + T *checkI, T *checkF, T *checkO, T *checkIGrad, + T *checkFGrad, T *checkOGrad, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { - grad_og = activation(output_grad * state_atv, value_og, active_gate); - state_grad += activation(output_grad * value_og, state_atv, active_state) + - grad_og * checkO; - grad_in = activation(state_grad * value_ig, value_in, active_node); - grad_ig = activation(state_grad * value_in, value_ig, active_gate); - grad_fg = activation(state_grad * prev_state, value_fg, active_gate); - prev_state_grad = - grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; - checkIGrad = grad_ig * prev_state; - checkFGrad = grad_fg * prev_state; - checkOGrad = grad_og * state; + *grad_og = + activation((*output_grad) * (*state_atv), *value_og, active_gate); + *state_grad += + activation((*output_grad) * (*value_og), *state_atv, active_state) + + (*grad_og) * (*checkO); + *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node); + *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate); + *grad_fg = + activation((*state_grad) * 
(*prev_state), *value_fg, active_gate); + *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) + + (*state_grad) * (*value_fg); + *checkIGrad = (*grad_ig) * (*prev_state); + *checkFGrad = (*grad_fg) * (*prev_state); + *checkOGrad = (*grad_og) * (*state); } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -109,32 +112,33 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; HOSTDEVICE void operator()( - __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, - __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, - __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, - __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, - __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, - __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node, + __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og, + __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og, + __m256 *prev_state, __m256 *prev_state_grad, __m256 *state, + __m256 *state_grad, __m256 *state_atv, __m256 *output_grad, + __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad, + __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { - grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, - active_gate); - state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), - state_atv, active_state), - state_grad); - state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); - grad_in = - activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); - grad_ig = - activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); - grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, - active_gate); - prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), - _mm256_mul_ps(grad_fg, checkF)); - prev_state_grad = - _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); - checkIGrad = _mm256_mul_ps(grad_ig, prev_state); - checkFGrad = _mm256_mul_ps(grad_fg, prev_state); - checkOGrad = _mm256_mul_ps(grad_og, state); + *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og, + active_gate); + *state_grad = + _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), + *state_atv, active_state), + *state_grad); + *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); + *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in, + active_node); + *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig, + active_gate); + *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg, + active_gate); + *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI), + _mm256_mul_ps(*grad_fg, *checkF)); + *prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad); + *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state); + *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state); + *checkOGrad = _mm256_mul_ps(*grad_og, *state); } #endif #endif diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index b9a1b9ae4d6e8c82aa782190d9e145e7a2b502be..b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ 
-41,7 +41,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod, if (paddle::platform::is_cpu_place(*place)) { seq = cpu_seq; } else { - TensorCopy(cpu_seq, *place, *context, &seq); + TensorCopySync(cpu_seq, *place, &seq); seq.set_lod(lod); } @@ -64,7 +64,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod, if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; } else { - TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back); cpu_seq_back.set_lod(lod); } diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 45a2550793511f7cb8c20644ac79e9e88629ce7b..2f8a602f3c5c0a7c262235f99943ce336e20a7b4 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); platform::CUDAPlace place = boost::get(ctx.GetPlace()); @@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 39c862b03ad497dca5c38ccecff20be510ab60e5..d60a99994edc926456706ff6a3ba998a3e5e7dd5 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc index 38cd97c17b16a4cc64f7e6d52150fae392df6036..e057244c1e974edea1b9bbc76c0585c295495299 100644 --- a/paddle/fluid/operators/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/proximal_adagrad_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class ProximalAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -55,6 +56,12 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc index efb4e1ac204ce79bfad7d77038f342be09e8f0e8..ed1472631870e5aee6b0e8b8f80bb5e6c84a3851 100644 --- a/paddle/fluid/operators/proximal_gd_op.cc +++ b/paddle/fluid/operators/proximal_gd_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; class ProximalGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -43,6 +44,12 @@ class ProximalGDOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 082f761d37ebdecccfa69981856dea2daef818b8..9ae80da6550bcef39c07f05e35d4153c24738f09 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -224,7 +224,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::__shfl_down_sync(mask, val, offset); + val += platform::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); @@ -284,7 +284,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. 
- val += platform::__shfl_down_sync(mask, val, offset); + val += platform::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h index b9c565cac9581a2e830697c1136919062eef345c..b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e 100644 --- a/paddle/fluid/operators/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); length_data = length_cpu.data(); } @@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); length_data = length_cpu.data(); } diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index faaae1f9b6ae1e2c577017c34116dab95e261ab2..9da8551eb2d7ea66ad434c42b54522432095ce29 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -241,7 +241,8 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, CREATE_SHFL_MASK(mask, true); if (maxid[0] / 32 == warp) { - if (platform::__shfl_sync(mask, *beam, (maxid[0]) % 32, 32) == MaxLength) + if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == + MaxLength) break; } } diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 2405f33d4f0ad83611e57d07a47e787eab439285..e81c385727be5c2ba3f02bfbd86168cb4650dfda 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -18,34 +18,33 @@ limitations under the License. */ namespace paddle { namespace platform { -// __shfl_down and __shfl have been deprecated as of CUDA 9.0. 
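+// __shfl_down and __shfl were deprecated in CUDA 9.0 in favor of the *_sync
+// intrinsics, and wrappers reusing the reserved __shfl_* names risk clashing
+// with the CUDA 9 builtins. The renamed CudaShuffleDownSync/CudaShuffleSync
+// below select the right intrinsic for the detected CUDA_VERSION so that call
+// sites stay uniform. A minimal call-site sketch (mask/val/offset as defined
+// in reduceSum later in this file):
+//   CREATE_SHFL_MASK(mask, tid < len);
+//   for (int offset = warpSize / 2; offset > 0; offset /= 2)
+//     val += platform::CudaShuffleDownSync(mask, val, offset);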
#if CUDA_VERSION < 9000 -template <typename T> -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} - -template <typename T> -__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, - int width) { - return __shfl(val, src_line, width); -} #define CREATE_SHFL_MASK(mask, predicate) mask = 0u; #else #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + template <typename T> -__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) { - return __shfl_down_sync(mask, val, delta); +__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, + int delta, int width = 32) { +#if CUDA_VERSION < 9000 + return __shfl_down(val, delta, width); +#else + return __shfl_down_sync(mask, val, delta, width); +#endif } template <typename T> -__forceinline__ __device__ T __shfl_sync(unsigned mask, T val, int src_line, - int width) { +__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, + int width = 32) { +#if CUDA_VERSION < 9000 + return __shfl(val, src_line, width); +#else return __shfl_sync(mask, val, src_line, width); -} #endif +} template <typename T> __device__ T reduceSum(T val, int tid, int len) { @@ -61,7 +60,7 @@ __device__ T reduceSum(T val, int tid, int len) { CREATE_SHFL_MASK(mask, tid < len); for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); + val += platform::CudaShuffleDownSync(mask, val, offset); if (tid < warpSize) shm[tid] = 0; @@ -75,7 +74,7 @@ __device__ T reduceSum(T val, int tid, int len) { if (tid < warpSize) { val = shm[tid]; for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); } return val; } diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 412cdda286c3a77af002fdc5eb6a5ae440606b82..cfddd8e8711f8005e0eff7ef7a2980f535b2f851 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,12 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" + #include #include #include #include +#include #include #include // NOLINT +#include #include #ifdef PADDLE_WITH_CUDA #include @@ -33,6 +36,9 @@ namespace platform { struct EventList; +static int64_t profiler_lister_id = 0; +static bool should_send_profile_state = false; + // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; // The thread local event list only can be accessed by the specific thread @@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enable profiling, since the input state is ", "ProfilerState::kDisabled"); - PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, - "The profiling state should be disabled when calling ", - "EnableProfiler."); - g_state = state; - if (g_state == ProfilerState::kAll) { - GetDeviceTracer()->Enable(); + if (state == g_state) { + return; } + g_state = state; + should_send_profile_state = true; + GetDeviceTracer()->Enable(); #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy events first to reduce the startup overhead.
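Taken together, the profiler changes in this file and in variable_response.cc implement a small trainer-to-PS profiling handshake: the trainer piggybacks IsProfileEnabled() on each serialized variable (the WriteBool(VarMsg::kProfileFieldNumber, ...) hunk above), and the PS mirrors that state. A minimal sketch of the receive-side reaction, restating the Parse() logic above in isolation; ReactToProfileBit is a hypothetical name used only for illustration:

    #include <cstdint>
    #include "paddle/fluid/platform/profiler.h"
    #include "paddle/fluid/string/printf.h"

    // Sketch: how a PS reacts to the trainer's decoded profile bit.
    void ReactToProfileBit(bool profiling) {
      // A non-zero ListenerId() means SetProfileListener() ran, i.e. this is a PS.
      int64_t listener_id = paddle::platform::ListenerId();
      if (listener_id <= 0) return;
      if (profiling && !paddle::platform::IsProfileEnabled()) {
        // Trainer started profiling: start a CPU profile on the PS as well.
        paddle::platform::EnableProfiler(paddle::platform::ProfilerState::kCPU);
      } else if (!profiling && paddle::platform::IsProfileEnabled()) {
        // Trainer stopped profiling: flush the PS events to a per-listener file.
        paddle::platform::DisableProfiler(
            paddle::platform::EventSortingKey::kDefault,
            paddle::string::Sprintf("/tmp/profile_ps_%lld", listener_id));
      }
    }

The random listener id keeps profile files written by different PS processes to a shared /tmp from colliding.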
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path) { - PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, - "Can't disable profiling, since it's not starting."); + if (g_state == ProfilerState::kDisabled) return; // Mark the profiling stop. Mark("_stop_profiler_", nullptr); @@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key, ParseEvents(all_events, sorted_key); ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); - if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) { + if (tracer->IsEnabled()) { tracer->Disable(); tracer->GenProfile(profile_path); } g_state = ProfilerState::kDisabled; + should_send_profile_state = true; +} + +bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; } +bool ShouldSendProfileState() { return should_send_profile_state; } + +void SetProfileListener() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution<int64_t> dist6( + 1, std::numeric_limits<int64_t>::max()); + profiler_lister_id = dist6(rng); } +int64_t ListenerId() { return profiler_lister_id; } } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 428d9ebceaabd987261c1dcd6e66faf044b702c0..61b98143e41abb9e47d2c717c7876f1bab7f5077 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -114,5 +114,13 @@ void ResetProfiler(); void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path); +// Test if the profiler is currently enabled. +bool IsProfileEnabled(); +// Whether the trainer should send profiling state to PS. +bool ShouldSendProfileState(); +// Mark current process as PS by assigning a listener id.
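+// A non-zero ListenerId() doubles as the PS role flag: trainers never call
+// SetProfileListener(), so their id stays 0 and VariableResponse::Parse skips
+// the profile handling on non-PS processes.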
+void SetProfileListener(); +int64_t ListenerId(); + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index dcf4e2a8e013f8e4e70ac1335890e7df0a050b5f..37d368946770978700abe49eef6825e1d96839f0 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -40,16 +40,14 @@ import backward import regularizer import average import metrics +import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace -from distribute_transpiler import DistributeTranspiler -from distribute_transpiler_simple import SimpleDistributeTranspiler +from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, channel_close, Select) -from inference_transpiler import InferenceTranspiler import clip -from memory_optimization_transpiler import memory_optimize, release_memory import profiler import unique_name import recordio_writer @@ -58,7 +56,7 @@ from parallel_executor import ParallelExecutor Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ - trainer.__all__ + inferencer.__all__ + [ + trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [ 'io', 'initializer', 'layers', @@ -76,11 +74,6 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ 'WeightNormParamAttr', 'DataFeeder', 'clip', - 'SimpleDistributeTranspiler', - 'DistributeTranspiler', - 'InferenceTranspiler', - 'memory_optimize', - 'release_memory', 'profiler', 'unique_name', 'recordio_writer', diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 58e027695a7100245dd424583e2cedeed3d165e6..b38526bc574a103ece86aecdecf06b0bcfd6cad0 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -13,7 +13,9 @@ # limitations under the License. import core - +import framework +import executor +import io __all__ = ['Inferencer', ] @@ -29,6 +31,15 @@ class Inferencer(object): # 4. load params from param_path into scope self.scope = core.Scope() self.place = place + self.startup_program = framework.Program() + # TODO: generate the startup_program with network_func + + exe = executor.Executor(place) + exe.run(self.startup_program, scope=self.scope) + + if param_path: + # load params from param_path into scope + io.load_persistables(exe, dirname=param_path) def infer(self, inputs): # run self.program diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index c618b02a768f2ca3e2b2914d8ee0134836d5c0d2..bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -251,7 +251,7 @@ class EditDistance(MetricBase): self.instance_error += seq_num - seq_right_count self.total_distance += total_distance - def eval(): + def eval(self): if self.seq_num == 0: raise ValueError( "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." 
@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase): super(DetectionMAP, self).__init__(name) # the current map value self.value = .0 + self.weight = .0 def update(self, value, weight): if not _is_number_or_matrix_(value): @@ -340,8 +341,8 @@ class Auc(MetricBase): raise ValueError("The 'predictions' must be a numpy ndarray.") kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] + thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) - for i in range(self._num_thresholds - 2)] thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] # calculate TP, FN, TN, FP count @@ -358,19 +359,20 @@ class Auc(MetricBase): fp += 1 else: tn += 1 - tp_list[idx_thresh] += tp - fn_list[idx_thresh] += fn - tn_list[idx_thresh] += tn - fp_list[idx_thresh] += fp + self.tp_list[idx_thresh] += tp + self.fn_list[idx_thresh] += fn + self.tn_list[idx_thresh] += tn + self.fp_list[idx_thresh] += fp def eval(self): epsilon = self._epsilon num_thresholds = self._num_thresholds - tpr = (tp_list.astype("float32") + epsilon) / ( - tp_list + fn_list + epsilon) - fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) - rec = (tp_list.astype("float32") + epsilon) / ( - tp_list + fp_list + epsilon) + tpr = (self.tp_list.astype("float32") + epsilon) / ( + self.tp_list + self.fn_list + epsilon) + fpr = self.fp_list.astype("float32") / ( + self.fp_list + self.tn_list + epsilon) + rec = (self.tp_list.astype("float32") + epsilon) / ( + self.tp_list + self.fp_list + epsilon) x = fpr[:num_thresholds - 1] - fpr[1:] y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index f4128dcbe935ea5caf63b8c0b377f9bed36d6705..6b80b007e9080922241ee6c66e0577a18b357980 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -43,7 +43,7 @@ class ParallelExecutor(object): training. allow_op_delay(bool, default False): Whether to delay and buffer some operators together for scheduling or not, which may - improve performance in some cases, defalut False. + improve performance in some cases, default False. share_vars_from(ParallelExecutor, default None): If provided, it will share variables from the specified ParallelExecutor. use_default_grad_scale(bool, default True): If set True, a default @@ -93,7 +93,7 @@ class ParallelExecutor(object): if use_cuda: # Experiments on se-resnext show that too many threads hurt # performance. Worth tuning for other models in the future.
- num_threads = len(self._places) + num_threads = len(self._places) * 2 else: num_threads = min( len(self._places) * 2, multiprocessing.cpu_count()) @@ -130,6 +130,7 @@ class ParallelExecutor(object): local_scopes, allow_op_delay, use_default_grad_scale) + self.scope = scope def run(self, fetch_list, feed=None, feed_dict=None): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 7703dfe0135b402f830bcdeaf47c26e5e3f8ca58..dd4ef7cc94ea1e8de5fe4775408389907d47d0d6 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param): class TestConv3dOp(OpTest): def setUp(self): + self.op_type = "conv3d" self.use_cudnn = False + self.dtype = np.float32 + self.init_kernel_type() self.init_group() - self.init_op_type() self.init_dilation() self.init_test_case() @@ -80,20 +82,24 @@ class TestConv3dOp(OpTest): 'stride': self.stride, 'pad': self.pad, 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } - input = np.random.random(self.input_size).astype("float32") - filter = np.random.random(self.filter_size).astype("float32") + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) output = conv3d_forward_naive(input, filter, self.groups, - conv3d_param).astype("float32") + conv3d_param).astype(self.dtype) - self.inputs = {'Input': input, 'Filter': filter} + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } self.attrs = { 'strides': self.stride, 'paddings': self.pad, 'groups': self.groups, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn } self.outputs = {'Output': output} @@ -108,6 +114,8 @@ class TestConv3dOp(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -120,6 +128,8 @@ class TestConv3dOp(OpTest): set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -135,6 +145,8 @@ class TestConv3dOp(OpTest): no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -163,8 +175,8 @@ class TestConv3dOp(OpTest): def init_group(self): self.groups = 1 - def init_op_type(self): - self.op_type = "conv3d" + def init_kernel_type(self): + pass class TestCase1(TestConv3dOp): @@ -235,34 +247,90 @@ class TestWithDilation(TestConv3dOp): self.groups = 3 +#----------------Conv3dCUDNN---------------- class TestCUDNN(TestConv3dOp): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16CUDNN(TestConv3dOp): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithGroup1CUDNN(TestWithGroup1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - 
self.op_type = "conv3d" + + +class TestFP16WithGroup1CUDNN(TestWithGroup1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithGroup2CUDNN(TestWithGroup2): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16WithGroup2CUDNN(TestWithGroup2): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWith1x1CUDNN(TestWith1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16With1x1CUDNN(TestWith1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) # FIXME(typhoonzero): find a way to determine if diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index c783a142467f3f6a9cd210425acfc526a32a6f71..9056f5e66fceb42397c9a923d802320dd772725b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -280,7 +280,7 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( './mnist.recordio', reader, feeder) - def test_simple_fc(self): + def check_simple_fc_convergence(self): self.check_network_convergence(simple_fc_net) self.check_network_convergence(simple_fc_net, allow_op_delay=True) @@ -290,7 +290,10 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}) - def test_simple_fc_parallel_accuracy(self): + def test_simple_fc(self): + self.check_simple_fc_convergence() + + def check_simple_fc_parallel_accuracy(self): img = numpy.zeros(shape=[32, 784], dtype='float32') label = numpy.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -311,7 +314,10 @@ class TestMNIST(TestParallelExecutorBase): for p_l in parallel_last_loss: self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) - def test_batchnorm_fc(self): + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy() + + def check_batchnorm_fc_convergence(self): self.check_network_convergence(fc_with_batchnorm) img = numpy.zeros(shape=[32, 784], dtype='float32') label = numpy.ones(shape=[32, 1], dtype='int64') @@ -319,6 +325,9 @@ class TestMNIST(TestParallelExecutorBase): fc_with_batchnorm, feed_dict={"image": img, 
"label": label}) + def test_batchnorm_fc(self): + self.check_batchnorm_fc_convergence() + class TestResnet(TestParallelExecutorBase): # @classmethod @@ -339,7 +348,7 @@ class TestResnet(TestParallelExecutorBase): # fluid.recordio_writer.convert_reader_to_recordio_file( # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) - def test_resnet(self): + def check_resnet_convergence(self): import functools batch_size = 2 self.check_network_convergence( @@ -348,6 +357,9 @@ class TestResnet(TestParallelExecutorBase): iter=20, batch_size=batch_size) + def test_resnet(self): + self.check_resnet_convergence() + class ModelHyperParams(object): # Dictionary size for source and target language. This model directly uses @@ -510,7 +522,7 @@ class TestTransformer(TestParallelExecutorBase): class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def test_parallel_testing(self): + def check_network_convergence(self): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -550,6 +562,9 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): "Train loss: " + str(train_loss) + "\n Test loss:" + str(test_loss)) + def test_parallel(self): + self.check_network_convergence() + import paddle.dataset.conll05 as conll05 import paddle.fluid as fluid @@ -568,21 +583,26 @@ embedding_name = 'emb' def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - **ignored): + is_sparse, **ignored): # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, + is_sparse=is_sparse, size=[pred_dict_len, word_dim], dtype='float32', param_attr='vemb') mark_embedding = fluid.layers.embedding( - input=mark, size=[mark_dict_len, mark_dim], dtype='float32') + input=mark, + is_sparse=is_sparse, + size=[mark_dict_len, mark_dim], + dtype='float32') word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], + is_sparse=is_sparse, input=x, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input @@ -632,7 +652,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, class TestCRFModel(unittest.TestCase): - def test_all(self): + def check_network_convergence(self, is_sparse): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -652,6 +672,7 @@ class TestCRFModel(unittest.TestCase): name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) mark = fluid.layers.data( name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) @@ -694,3 +715,9 @@ class TestCRFModel(unittest.TestCase): print map(numpy.array, pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name]))[0] + + def test_update_sparse_parameter(self): + self.check_network_convergence(is_sparse=True) + + def test_update_dense_parameter(self): + self.check_network_convergence(is_sparse=False) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index aaa94842513691c836e04353aa4bc5ce5e66c5c3..142165f29beeaedfaa660f04424147e06710d192 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x, class TestPool3d_Op(OpTest): def setUp(self): + self.op_type = "pool3d" self.use_cudnn = False + 
self.dtype = np.float32 self.init_test_case() self.init_global_pool() - self.init_op_type() + self.init_kernel_type() self.init_pool_type() self.init_ceil_mode() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] - input = np.random.random(self.shape).astype("float32") + input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive(input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode).astype("float32") - self.inputs = {'X': input} + self.ceil_mode).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { 'strides': self.strides, @@ -116,7 +118,7 @@ class TestPool3d_Op(OpTest): 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } - self.outputs = {'Out': output.astype('float32')} + self.outputs = {'Out': output} def testcudnn(self): return core.is_compiled_with_cuda() and self.use_cudnn @@ -129,6 +131,8 @@ class TestPool3d_Op(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.testcudnn() and self.pool_type != "max": place = core.CUDAPlace(0) self.check_grad_with_place( @@ -142,8 +146,8 @@ class TestPool3d_Op(OpTest): self.strides = [1, 1, 1] self.paddings = [0, 0, 0] - def init_op_type(self): - self.op_type = "pool3d" + def init_kernel_type(self): + pass def init_pool_type(self): self.pool_type = "avg" @@ -158,15 +162,11 @@ class TestPool3d_Op(OpTest): class TestCase1(TestPool3d_Op): def init_test_case(self): - self.op_type = "pool3d" self.shape = [2, 3, 7, 7, 7] self.ksize = [3, 3, 3] self.strides = [1, 1, 1] self.paddings = [0, 0, 0] - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "avg" self.pool3D_forward_naive = avg_pool3D_forward_naive @@ -182,9 +182,6 @@ class TestCase2(TestPool3d_Op): self.strides = [1, 1, 1] self.paddings = [1, 1, 1] - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "avg" self.pool3D_forward_naive = avg_pool3D_forward_naive @@ -194,27 +191,18 @@ class TestCase2(TestPool3d_Op): class TestCase3(TestPool3d_Op): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase4(TestCase1): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase5(TestCase2): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive @@ -222,39 +210,105 @@ class TestCase5(TestCase2): #--------------------test pool3d-------------------- class TestCUDNNCase1(TestPool3d_Op): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase1(TestPool3d_Op): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase2(TestCase1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase2(TestCase1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if 
core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase3(TestCase2): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase3(TestCase2): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase4(TestCase3): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase4(TestCase3): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase5(TestCase4): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase5(TestCase4): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase6(TestCase5): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase6(TestCase5): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCeilModeCase1(TestCUDNNCase1): diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 0aada3deb0f047d21701b64af022ebad372d505b..a9fa2359e019d0301110dfc3573e2d2534c6a47a 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -12,14 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import core import framework import executor import data_feeder import contextlib +import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module +from transpiler import distribute_transpiler __all__ = [ 'Trainer', @@ -76,22 +79,60 @@ class Trainer(object): raise TypeError( "The optimizer should be an instance of Optimizer") - optimizer.minimize(loss) + optimize_ops, params_grads = optimizer.minimize(loss) self.place = Trainer._check_and_get_place(place) + self.dist_transpile_if_necessary(optimize_ops, params_grads) + # 2. move the default_main_program to self.program and run the # default_startup program on an empty core.Scope() # Run startup program - exe = executor.Executor(place) - exe.run(self.startup_program, scope=self.scope) + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + exe.run(self.startup_program) if param_path: # load params from param_path into scope - # TODO(yuyang): This depends on parameters implementation. 
- pass - - # TODO(helin): support distributed training + io.load_persistables(exe, dirname=param_path) + + def dist_transpile_if_necessary(self, optimize_ops, params_grads): + if "PADDLE_TRAINING_ROLE" not in os.environ: + return + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + # total number of workers/trainers in the job, needed by + # trainer and pserver + trainers = int(os.getenv("PADDLE_TRAINERS")) + # the IP of the local machine, needed by pserver only + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + # the unique trainer id, starting from 0, needed by trainer + # only + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + # the role, should be either PSERVER or TRAINER + training_role = os.getenv("PADDLE_TRAINING_ROLE") + with self._prog_and_scope_guard(): + t = distribute_transpiler.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + self.train_program = t.get_pserver_program(current_endpoint) + self.startup_program = t.get_startup_program(current_endpoint, + self.train_program) + elif training_role == "TRAINER": + self.train_program = t.get_trainer_program() + else: + raise ValueError( + 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + ) def train(self, num_epochs, @@ -117,6 +158,13 @@ class Trainer(object): raise NotImplementedError( "Parallel Executor version of trainer is not implemented") + training_role = os.getenv("PADDLE_TRAINING_ROLE", "") + if training_role == "PSERVER": + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + exe.run() + return + self._train_by_executor(num_epochs, event_handler, reader, feed_order) def test(self, reader): @@ -124,7 +172,9 @@ class Trainer(object): def save_params(self, param_path): # reference: save_persistables in io.py - pass + exe = executor.Executor(self.place) + io.save_persistables( + exe, dirname=param_path, main_program=self.startup_program) @staticmethod def _check_and_get_place(place): diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3c1b947f4acb1335b25e6eb0099d5d532c895a --- /dev/null +++ b/python/paddle/fluid/transpiler/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from distribute_transpiler import DistributeTranspiler +from inference_transpiler import InferenceTranspiler +from memory_optimization_transpiler import memory_optimize, release_memory +from distribute_transpiler_simple import SimpleDistributeTranspiler + +__all__ = [ + "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler", + "memory_optimize", "release_memory" +] diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py similarity index 98% rename from python/paddle/fluid/distribute_transpiler.py rename to python/paddle/fluid/transpiler/distribute_transpiler.py index ee17b11c8baaa1da0669ee55dfbeae4f3a0a3620..640ac9f085e6dc83bb04faafadf4846089ad3e29 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -17,9 +17,8 @@ from __future__ import print_function import math import distributed_splitter as splitter -import framework -from framework import Program, default_main_program, Variable, Parameter -from . import core +from .. import core +from ..framework import Program, default_main_program, Variable, Parameter LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -135,6 +134,16 @@ def split_dense_variable(var_list, return blocks +def delete_ops(block, ops): + try: + start = list(block.ops).index(ops[0]) + end = list(block.ops).index(ops[-1]) + [block.remove_op(start) for _ in xrange(end - start + 1)] + except Exception, e: + raise e + block.program.sync_with_cpp() + + class DistributeTranspiler: def transpile(self, trainer_id, @@ -317,7 +326,7 @@ class DistributeTranspiler: def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.delete_ops(self.origin_program.global_block(), self.optimize_ops) + delete_ops(self.origin_program.global_block(), self.optimize_ops) # FIXME(typhoonzero): serialize once will fix error occurs when clone. self.origin_program.__str__() return self.origin_program @@ -601,7 +610,7 @@ class DistributeTranspiler: attrs={"axis": 0}) # delete lookup_table_op - self.delete_ops(program.global_block(), [op]) + delete_ops(program.global_block(), [op]) # break for loop break @@ -1164,12 +1173,3 @@ class DistributeTranspiler: in_name.startswith("beta2_pow_acc"): return True return False - - def delete_ops(self, block, ops): - try: - start = list(block.ops).index(ops[0]) - end = list(block.ops).index(ops[-1]) - [block.remove_op(start) for _ in xrange(end - start + 1)] - except Exception, e: - raise e - block.program.sync_with_cpp() diff --git a/python/paddle/fluid/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py similarity index 98% rename from python/paddle/fluid/distribute_transpiler_simple.py rename to python/paddle/fluid/transpiler/distribute_transpiler_simple.py index e94bbb6c39f7a017e2d0b79d050e6ff8e4371a14..ea8c27cdca885dbbf90349b35df9691951264061 100644 --- a/python/paddle/fluid/distribute_transpiler_simple.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
+from ..framework import Program, default_main_program, Parameter, Variable
+from ..layer_helper import LayerHelper


 def hash_name_to_server(params_grads, pserver_endpoints):
diff --git a/python/paddle/fluid/distributed_splitter.py b/python/paddle/fluid/transpiler/distributed_splitter.py
similarity index 100%
rename from python/paddle/fluid/distributed_splitter.py
rename to python/paddle/fluid/transpiler/distributed_splitter.py
diff --git a/python/paddle/fluid/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
similarity index 99%
rename from python/paddle/fluid/inference_transpiler.py
rename to python/paddle/fluid/transpiler/inference_transpiler.py
index 39b01610f96018e1775405a30147e77006cecc16..202aa76084432b4b2378470919b2e924301f2130 100644
--- a/python/paddle/fluid/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -13,9 +13,9 @@
 # limitations under the License.

 import numpy as np
-from framework import Program
-from executor import global_scope
-from . import core
+from .. import core
+from ..framework import Program
+from ..executor import global_scope


 class InferenceTranspiler:
diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
similarity index 98%
rename from python/paddle/fluid/memory_optimization_transpiler.py
rename to python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 20ed19104207c1f0aa45db8f44570377011f3cde..49034b47b2d184e4027bcebc29413a163340fdaa 100644
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -13,11 +13,9 @@
 # limitations under the License.

 from collections import defaultdict
-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import backward
-from backward import _rename_arg_
-from . import core
+from .. import core
+from ..framework import Program, default_main_program, Parameter, Variable
+from ..backward import _rename_arg_

 dtype_to_size = {
     core.VarDesc.VarType.FP16: 2,
diff --git a/tools/timeline.py b/tools/timeline.py
index f4083c824e7333a74661d096d4954609f767c83e..8cd6353d46f496831cb61c1cdbbd156ca0579fb4 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--profile_path', type=str, default='', help='Input profile file name.')
+    '--profile_path',
+    type=str,
+    default='',
+    help='Input profile file name. If there are multiple files, the format '
+    'should be trainer1=file1,trainer2=file2,ps=file3')
 parser.add_argument(
     '--timeline_path', type=str, default='', help='Output timeline file name.')
 args = parser.parse_args()
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):


 class Timeline(object):
-    def __init__(self, profile_pb):
-        self._profile_pb = profile_pb
+    def __init__(self, profile_dict):
+        self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
@@ -120,35 +124,37 @@ class Timeline(object):
         return cur_pid

     def _allocate_pids(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                if (event.device_id, "CPU") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "CPU")] = pid
-                    self._chrome_trace.emit_pid("cpu:block:%d" %
-                                                (event.device_id), pid)
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                if (event.device_id, "GPUKernel") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "GPUKernel")] = pid
-                    self._chrome_trace.emit_pid("gpu:%d" % (event.device_id),
-                                                pid)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
+                                                    (k, event.device_id), pid)
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "GPUKernel")] = pid
+                        self._chrome_trace.emit_pid("%s:gpu:%d" %
+                                                    (k, event.device_id), pid)

     def _allocate_events(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                type = "CPU"
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                type = "GPUKernel"
-            pid = self._devices[(event.device_id, type)]
-            args = {'name': event.name}
-            if event.memcopy.bytes > 0:
-                args = {'mem_bytes': event.memcopy.bytes}
-            # TODO(panyx0718): Chrome tracing only handles ms. However, some
-            # ops takes micro-seconds. Hence, we keep the ns here.
-            self._chrome_trace.emit_region(
-                event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-                event.sub_device_id, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    event_type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    event_type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, event_type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However,
+                # some ops take microseconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
+                    event.sub_device_id, 'Op', event.name, args)

     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -163,11 +169,23 @@
 timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path

-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+    profile_dict['trainer'] = profile_pb
+else:
+    for kv in profile_paths:
+        k, v = kv.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+        profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())
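
A note on the Trainer changes in python/paddle/fluid/trainer.py above: dist_transpile_if_necessary is configured entirely through PADDLE_* environment variables. As a minimal sketch of that convention (the addresses, port, and counts below are hypothetical; the variable semantics come from the comments in the code above), a one-pserver, two-trainer job would set up each process roughly like this before constructing the Trainer:

    import os

    # Shared by every process in this hypothetical job.
    os.environ["PADDLE_PSERVER_PORT"] = "6174"        # port all pservers listen on
    os.environ["PADDLE_PSERVER_IPS"] = "192.168.0.2"  # comma-separated pserver IPs
    os.environ["PADDLE_TRAINERS"] = "2"               # total number of trainers

    # On the pserver process only:
    os.environ["PADDLE_TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_CURRENT_IP"] = "192.168.0.2"   # this machine's IP

    # On trainer 0 only (trainer 1 would set PADDLE_TRAINER_ID=1):
    # os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"
    # os.environ["PADDLE_TRAINER_ID"] = "0"

From these values the code derives pserver_endpoints ("192.168.0.2:6174") and, on the pserver, current_endpoint; a PSERVER process then blocks in exe.run() inside train(), while a TRAINER runs the transpiled trainer program.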
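
The delete_ops helper that moves to module scope in distribute_transpiler.py removes a contiguous run of operators by repeatedly deleting at the start index, since each removal shifts the remaining ops one slot left. A toy stand-in over a plain Python list (the op names are invented for illustration) shows the index arithmetic:

    # Stand-in for block.ops; delete_ops(block, ops) behaves like this.
    ops = ["mul", "elementwise_add", "sgd_w", "sgd_b", "send"]
    span = ops[2:4]  # pretend these are the optimize ops to strip

    start = ops.index(span[0])  # 2
    end = ops.index(span[-1])   # 3
    for _ in range(end - start + 1):
        del ops[start]          # deleting at `start` shifts the tail left

    print(ops)  # ['mul', 'elementwise_add', 'send']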
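
Finally, with the tools/timeline.py change, --profile_path accepts either a single profile dump (keyed as "trainer") or comma-separated name=file pairs. A small sketch of the parsing convention (the file paths are hypothetical):

    # Hypothetical --profile_path values and the mapping the script derives.
    single = "/tmp/profile_vgg"
    multi = "trainer0=/tmp/profile_0,trainer1=/tmp/profile_1,ps=/tmp/profile_ps"

    for spec in (single, multi):
        paths = spec.split(',')
        if len(paths) == 1:
            profiles = {"trainer": paths[0]}  # a lone file is keyed "trainer"
        else:
            profiles = dict(p.split('=') for p in paths)
        print(profiles)

Each name then prefixes the Chrome-trace pid labels ("trainer0:cpu:block:0", "ps:gpu:0", and so on), so events from different processes appear as separate rows when the generated timeline is loaded into chrome://tracing.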