diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b19d50a6ad6afa312f5e695583174e56bf490755..d71d792b4e0147afc0ec09808d2004cb4b48ba46 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -235,6 +235,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec')) paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index 878b950858a71ba0e10ab2643667922420d29099..c44793cd11d22b29b4b3422a047d81fe26624982 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -13,125 +13,186 @@ // limitations under the License. #include -#include +#include #include #include #include +#include #include -#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_graph_view.h" -#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { namespace details { -VarHandle* GetValidInput(const OpHandleBase* a) { - for (auto p : a->Inputs()) { - VarHandle* b = dynamic_cast(p); - if (b) { - return b; +class AllReduceDepsPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override { + std::vector all_reduce_op_handles = + GetSortedAllReduceOps(*graph); + + for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + all_reduce_op_handles[i - 1]->AddOutput(dep_var); + all_reduce_op_handles[i]->AddInput(dep_var); } - } - return nullptr; -} - -void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const { - auto graph_ops = ir::FilterByNodeWrapper(*graph); - - // get vars order - int order = 0; - std::unordered_map vars; - // TODO(gongwb): use graph topology sort to find the order of operators. - // Note that must assert topology sort is stable - auto& ops = graph->Get>(kStaleProgramOpDescs); - for (auto* op_desc : ops) { - try { - bool is_bk_op = - static_cast(boost::get(op_desc->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) continue; - - auto backward_vars = - boost::get>(op_desc->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - auto outputs = op_desc->Outputs(); - for (auto& o_it : outputs) { - for (auto& v : o_it.second) { // values - vars[v] = order; - VLOG(10) << "in all_reduce_deps_pass:" << v; - } - } - order++; - } catch (boost::bad_get e) { + if (VLOG_IS_ON(10)) { + DebugString(*graph, all_reduce_op_handles); } } - std::vector dist_ops; - // get allreduce ops. - for (auto& op : graph_ops) { - // FIXME(gongwb):add broad cast. - if (op->Name() == "all_reduce" || op->Name() == "reduce") { - dist_ops.push_back(op); + std::vector GetSortedAllReduceOps( + const ir::Graph& graph) const { + std::vector all_reduce_op_handles; + std::unordered_map pending_ops; + std::unordered_set ready_ops; + std::unordered_set next_ready_ops; + + auto op_handles = ir::FilterByNodeWrapper(graph); + size_t num_of_ops = op_handles.size(); + for (OpHandleBase* op : op_handles) { + size_t not_ready_vars = op->NotReadyInputSize(); + if (not_ready_vars) { + pending_ops.insert({op, not_ready_vars}); + } else { + ready_ops.insert(op); + } } - } - - VLOG(10) << "dist_ops size:" << dist_ops.size() - << ", outputs size:" << vars.size() << ", ops size:" << ops.size(); - - std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, - OpHandleBase* op2) { - VarHandle* i0 = dynamic_cast(GetValidInput(op1)); - VarHandle* i1 = dynamic_cast(GetValidInput(op2)); - - PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error", - op1->DebugString(), op2->DebugString()); - auto l_it = vars.find(i0->name()); - auto r_it = vars.find(i1->name()); - - PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(), - "can't find var's name %s and %s in opdesc", i0->name(), - i1->name()); - - if (l_it->second < r_it->second) return true; + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + + size_t has_run_ops = ready_ops.size(); + while (has_run_ops != num_of_ops) { + for (auto* op : ready_ops) { + for (auto& ready_var : op->Outputs()) { + for (auto* pend_op : ready_var->PendingOps()) { + auto& deps = --pending_ops[pend_op]; + if (deps == 0) { + next_ready_ops.insert(pend_op); + } + } + } + } - if (l_it->second == r_it->second) { - return i0->name() < i1->name(); + PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle."); + ready_ops.clear(); + std::swap(ready_ops, next_ready_ops); + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + has_run_ops += ready_ops.size(); } + return all_reduce_op_handles; + } - return false; - }); - - // add dependency. - auto& sorted_ops = dist_ops; - for (size_t i = 1; i < sorted_ops.size(); ++i) { - auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - - auto* pre_op = sorted_ops[i - 1]; - auto* op = sorted_ops[i]; - - pre_op->AddOutput(dep_var); - op->AddInput(dep_var); - graph->Get(kGraphDepVars).emplace(dep_var); + void GetSortedAllReduceOps( + const std::unordered_set& ready_ops, + std::vector* all_reduce_op_handles) const { + std::vector current_all_reduce_op_handles; + for (auto& op_handle : ready_ops) { + auto all_reduce_op_handle = dynamic_cast(op_handle); + if (all_reduce_op_handle) { + current_all_reduce_op_handles.emplace_back(all_reduce_op_handle); + } + } - VLOG(10) << "add all_reduce sequential dependencies between " << pre_op - << " and " << op; + // NOTE(zcd): For distributed training, it is important to keep the order of + // allReduce on each node consistent. Otherwise, hang may occur. + // Sort the current_all_reduce_op_handles according to the name of input. + sort(current_all_reduce_op_handles.begin(), + current_all_reduce_op_handles.end(), + [](const AllReduceOpHandle* left, + const AllReduceOpHandle* right) -> bool { + auto left_in_vars = DynamicCast(left->Inputs()); + auto right_in_vars = DynamicCast(right->Inputs()); + PADDLE_ENFORCE_GT(left_in_vars.size(), 0); + PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size()); + return left_in_vars[0]->Name() > right_in_vars[0]->Name(); + }); + + all_reduce_op_handles->insert(all_reduce_op_handles->end(), + current_all_reduce_op_handles.begin(), + current_all_reduce_op_handles.end()); + } - VLOG(10) << "pre_op:" << pre_op->DebugString() - << ", op:" << op->DebugString(); + void DebugString( + const ir::Graph& graph, + const std::vector& all_reduce_op_handles) const { + // get vars order + std::map> vars = + GetSoredGradientsFromStaleProgram(graph); + std::stringstream out; + size_t grads_of_stale_program = 0; + out << "Get Order From kStaleProgramOpDescs: "; + for (auto& var : vars) { + out << "Order " << var.first << " ["; + for (auto& var_name : var.second) { + out << var_name << ", "; + ++grads_of_stale_program; + } + out << "], "; + } + VLOG(10) << out.str(); + + std::stringstream out2; + out2 << "Get Order From Topological order: "; + for (auto& op : all_reduce_op_handles) { + bool find_valid_input = false; + for (auto& in_var : op->Inputs()) { + if (dynamic_cast(in_var)) { + out2 << in_var->Name() << ", "; + find_valid_input = true; + break; + } + } + PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input."); + } + VLOG(10) << out2.str(); + if (grads_of_stale_program != all_reduce_op_handles.size()) { + VLOG(10) + << "The gradients number of stale program and graph is not equal."; + } } -} + std::map> GetSoredGradientsFromStaleProgram( + const ir::Graph& graph) const { + std::map> vars; + auto ops = graph.Get>(kStaleProgramOpDescs); + int order = 0; + for (auto* op_desc : ops) { + try { + bool is_bk_op = + static_cast(boost::get(op_desc->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + + auto backward_vars = + boost::get>(op_desc->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + if (backward_vars.empty()) continue; + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + for (size_t i = 1; i < backward_vars.size(); i += 2) { + vars[order].emplace_back(backward_vars[i]); + VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1] + << ", " << backward_vars[i]; + } + order++; + } catch (boost::bad_get e) { + } + } + return vars; + } +}; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h deleted file mode 100644 index 4ed3736587aa3d45e288e3dc7e6ab3099f935f41..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace details { - -// TODO(gongwb): overlap allreduce with backward computation. -class AllReduceDepsPass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 6e477cd2977561ddb914e4a6343f677044fad4be..ed75b48090b27a9c430afe067467a1a39d711938 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -28,7 +28,7 @@ // asynchronous nccl allreduce or synchronous issue: // https://github.com/PaddlePaddle/Paddle/issues/15049 DEFINE_bool( - sync_nccl_allreduce, false, + sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 027de4cda410178fbae11f1db9a580c2b7ad22a3..f8bf43bcb48226b4d1317a78ade7179741097378 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -163,14 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "graph_printer", new details::GraphvizSSAGraphPrinter); } - // Verify that the graph is correct for multi-device executor. - AppendPass("multi_devices_check_pass"); - - if (VLOG_IS_ON(2)) { - AppendPass("all_reduce_deps_pass"); - } - - if (SeqOnlyAllReduceOps(strategy_)) { + // experimental shows that the program will be faster if append + // all_reduce_deps_pass here. + if (!strategy_.enable_parallel_graph_ && + (SeqOnlyAllReduceOps(strategy_) || + strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } @@ -179,6 +176,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { VLOG(10) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } + + // Verify that the graph is correct for multi-device executor. + AppendPass("multi_devices_check_pass"); } // Convert graph to run on multi-devices. diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 413b14961631b3459e0d05af685ad1c5395844c2..69cd84ebf2d678c089141f09a92c46e3a03fe4d9 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) { if (out_var_handle) { PADDLE_ENFORCE( platform::is_same_place(place, out_var_handle->place()), - "The place of input(%s) is not consistent with the " + "The place of output(%s) is not consistent with the " "place of current op(%s).", out_var_handle->Name(), Name()); out_var_handle->SetGenerateEvent(events_.at(dev_id)); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 353db435213c74982d582e5be298ecfb1a810f30..e6f5b15af8cd440a9304235acfe62787c5f1b134 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -617,6 +617,25 @@ void OpDesc::Flush() { static std::once_flag init_infer_shape_funcs; +/** + * NOTE(paddle-dev): Very tricky code here. Maybe we should find a + * better way to register compile-time infershape method gentlely. + * + * Normally, we can register a class derived from InferShapeBase, so that + * we can set the field of `infer_shape_` inside OpInfo when registering op. + * + * However, there is another way we can set the field of `infer_shape_` inside + * OpInfo. Usually, we overload InferShape method of OperatorWithKernel. After + * running the following method InitInferShapeFuncs, `infer_shape_` would be set + * to be the InferShape method of OperatorWithKernel. That is to say, we borrow + * the run-time InferShape method of OperatorWithKernel to be the compile-time + * InferShape method. + * + * However, during compiling time, we may not know inputs, outputs and attrs of + * run-time OperatorWithKernel. So the following code creates a fake + * OperatorWithKernel object. That is why the field info_ of OperatorBase + * would be null. + */ static void InitInferShapeFuncs() { std::call_once(init_infer_shape_funcs, [] { auto &map = OpInfoMap::Instance(); @@ -628,11 +647,16 @@ static void InitInferShapeFuncs() { PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered", op_type); auto &op_info = it->second; - auto op = static_cast(op_info.Creator()( - "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. continue; } + + auto op = dynamic_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); + + PADDLE_ENFORCE_NOT_NULL( + op, "InferShapeBase is not registered to Operator %s", op_type); + op_info.infer_shape_ = [op](InferShapeContext *ctx) { op->InferShape(ctx); }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a2a8083da955c93175ab2f01a37737c145e6f1b8..4245caf1689c76d72b410c742488c55562c8b998 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -19,11 +19,6 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/framework/ir/graph_helper.h" - -#include "paddle/fluid/framework/ir/graph.h" - -#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -31,6 +26,8 @@ limitations under the License. */ #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" #ifdef WITH_GPERFTOOLS diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 0d116a6495477ca69c10c130e63247a4f6c03b23..e52a0283f726640eb56b24a2978af6ee44e658ff 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) cc_library(imperative_profiler SRCS profiler.cc) +cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) + +cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) endif() diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..04364e68342810f6babee1df0c5eb3b476de022a --- /dev/null +++ b/paddle/fluid/imperative/nccl_context.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/nccl_context.h" + +namespace paddle { +namespace imperative { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void NCCLParallelContext::RecvNCCLID(const std::string &ep, + ncclUniqueId *nccl_id) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ(addr.size(), 2UL, + "The endpoint should contain host and port: %s", ep); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int server_fd, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + char buffer[1024] = {0}; + int opt = 0; + // creating socket fd + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) + PADDLE_THROW("create server fd failed"); + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, + sizeof(opt))) + PADDLE_THROW("set socket opt failed"); + + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) + PADDLE_THROW("binding failed on ep: %s", ep); + VLOG(3) << "listening on: " << ep; + if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); + + if ((new_socket = + accept(server_fd, reinterpret_cast(&address), + reinterpret_cast(&addrlen))) < 0) + PADDLE_THROW("accept the new socket fd failed"); + + if (read(new_socket, buffer, 1024) < 0) + PADDLE_THROW("reading the ncclUniqueId from socket failed"); + VLOG(3) << "recevived the ncclUniqueId"; + memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); + + VLOG(3) << "closing the socket server: " << ep; + close(server_fd); +} + +void NCCLParallelContext::SendNCCLID(const std::string &ep, + ncclUniqueId *nccl_id) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ(addr.size(), 2UL, + "The endpoint should contain host and port: %s", ep); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + // struct sockaddr_in address; + int sock = 0; + struct sockaddr_in serv_addr; + char buffer[1024] = {0}; + + memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); + if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) + PADDLE_THROW("create socket failed"); + + memset(&serv_addr, '0', sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(port); + + if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) + PADDLE_THROW("invalied address: %s", ep); + + while (true) { + if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + VLOG(0) << "worker: " << ep + << " is not ready, will retry after 3 seconds..."; + std::this_thread::sleep_for(std::chrono::seconds(3)); + continue; + } + VLOG(3) << "sending the ncclUniqueId to " << ep; + send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0); + break; + } +} + +void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) { + if (strategy_.local_rank_ == root) { + for (auto ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id); + } + } else { + RecvNCCLID(strategy_.current_endpoint_, nccl_id); + } +} + +void NCCLParallelContext::Init() { + ncclUniqueId nccl_id; + ncclComm_t comm; + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_id); + BcastNCCLId(&nccl_id, 0); + } else { + BcastNCCLId(&nccl_id, 0); + } + int gpu_id = boost::get(place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id; + + PADDLE_ENFORCE(cudaSetDevice(gpu_id)); + PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_)); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast(pool.Get(place_)); + dev_ctx->set_nccl_comm(comm); +} +#endif + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h new file mode 100644 index 0000000000000000000000000000000000000000..51c86190439d1739a99bc91a712f663383bb9371 --- /dev/null +++ b/paddle/fluid/imperative/nccl_context.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +// network header files +#ifndef _WIN32 +#include +#include +#include +#include +#endif + +#include +#include + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/nccl.h" +#endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +namespace paddle { +namespace imperative { + +struct ParallelStrategy { + int nranks_{1}; + int local_rank_{0}; + std::vector trainer_endpoints_{}; + std::string current_endpoint_{""}; +}; + +class ParallelContext { + public: + explicit ParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : strategy_(strategy), place_(place) {} + + virtual ~ParallelContext() {} + + virtual void Init() = 0; + + protected: + ParallelStrategy strategy_; + platform::Place place_; +}; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +class NCCLParallelContext : ParallelContext { + public: + explicit NCCLParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : ParallelContext(strategy, place) {} + + ~NCCLParallelContext() {} + + void BcastNCCLId(ncclUniqueId* nccl_id, int root); + + void Init() override; + + protected: + void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id); + + void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id); +}; +#endif + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context_test.cc b/paddle/fluid/imperative/nccl_context_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..74a74ebe921378e2994a6a4cb2087d0acde950b1 --- /dev/null +++ b/paddle/fluid/imperative/nccl_context_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/nccl_context.h" +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; + +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:9866", "127.0.0.1:9867"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = 2; + strategy.local_rank_ = local_rank; + return strategy; +} + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) { + auto strategy = GetStrategy(local_rank); + platform::CUDAPlace gpu(local_rank); + imperative::NCCLParallelContext ctx(strategy, gpu); + ctx.BcastNCCLId(nccl_id, 0); +} + +TEST(BcastNCCLId, Run) { + ncclUniqueId nccl_id; + platform::dynload::ncclGetUniqueId(&nccl_id); + std::thread t(BcastNCCLId, 0, &nccl_id); + + ncclUniqueId recv_nccl_id; + BcastNCCLId(1, &recv_nccl_id); + + t.join(); + EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal, + NCCL_UNIQUE_ID_BYTES)); +} +#endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7c9d0af3ecd647604ab46ee6239fc352e5fd8d85..7c495ddd68221acfed8537fd72e9a582e891f8db 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, current_vars_map[out->Name()] = out; } - VLOG(3) << "input var name: " << out->Name() + VLOG(3) << "output var name: " << out->Name() << " inited: " << out->var_->IsInitialized() << " stop_grad: " << out->IsStopGradient(); } @@ -215,6 +215,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::Scope scope; op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); prepared_op.func( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index e5036d940197ef012cbfd8f52700c8aeb54fb6c5..0109b4a4fa7617880f642f5a39639bca38475515 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -142,7 +142,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { void AnalysisConfig::EnableMKLDNN() { #ifdef PADDLE_WITH_MKLDNN - pass_builder()->EnableMKLDNN(); use_mkldnn_ = true; #else LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; @@ -235,16 +234,13 @@ void AnalysisConfig::Update() { } if (use_mkldnn_) { +#ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { LOG(ERROR) << "EnableMKLDNN() only works when IR optimization is enabled."; + } else { + pass_builder()->EnableMKLDNN(); } -#ifdef PADDLE_WITH_MKLDNN - pass_builder()->EnableMKLDNN(); - use_mkldnn_ = true; -#else - LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; - use_mkldnn_ = false; #endif } @@ -256,9 +252,6 @@ void AnalysisConfig::Update() { } #ifdef PADDLE_WITH_MKLDNN pass_builder()->EnableMkldnnQuantizer(); -#else - LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; - use_mkldnn_quantizer_ = false; #endif } diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 258a79fa4e884177490fab79778151ae52537aa0..c89dd41e0a6283e0723e2925f28c0372cda6a2b2 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -27,6 +27,7 @@ #include #include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" @@ -266,17 +267,17 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { } static void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency, int epoch = 1) { - LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat - << ", threads: " << num_threads << ", thread id: " << tid - << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f) + double batch_latency, int epoch = 1) { + PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size."); + double sample_latency = batch_latency / batch_size; + LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; - if (epoch > 1) { - int samples = batch_size * epoch; - LOG(INFO) << "====== sample number: " << samples - << ", average latency of each sample: " << latency / samples - << "ms ======"; - } + LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch + << ", repetitions: " << repeat << " ======"; + LOG(INFO) << "====== batch latency: " << batch_latency + << "ms, number of samples: " << batch_size * epoch + << ", sample latency: " << sample_latency + << "ms, fps: " << 1000.f / sample_latency << " ======"; } static bool IsFileExists(const std::string &path) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d1d39e44096b9f50e5bc9603fa12aba92b0e8e2..e765c078aa838de6513e6f4d6729e3b1fb2958db 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -64,10 +64,12 @@ void PaddlePassBuilder::DeletePass(size_t idx) { passes_.erase(std::begin(passes_) + idx); } -void GpuPassStrategy::EnableMKLDNN() { - LOG(ERROR) << "GPU not support MKLDNN yet"; +void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { + analysis_passes_.push_back(pass); } +void PaddlePassBuilder::ClearPasses() { passes_.clear(); } + // The following passes works for Anakin sub-graph engine. const std::vector kAnakinSubgraphPasses({ "infer_clean_graph_pass", // @@ -102,12 +104,12 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { use_gpu_ = true; } -void GpuPassStrategy::EnableMkldnnQuantizer() { - LOG(ERROR) << "GPU not support MKL-DNN quantization"; +void GpuPassStrategy::EnableMKLDNN() { + LOG(ERROR) << "GPU not support MKLDNN yet"; } -void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { - analysis_passes_.push_back(pass); +void GpuPassStrategy::EnableMkldnnQuantizer() { + LOG(ERROR) << "GPU not support MKL-DNN quantization"; } CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { @@ -130,10 +132,44 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // - "identity_scale_op_clean_pass", // "runtime_context_cache_pass", // }); + use_gpu_ = false; } -void PaddlePassBuilder::ClearPasses() { passes_.clear(); } + +void CpuPassStrategy::EnableMKLDNN() { +// TODO(Superjomn) Consider the way to mix CPU with GPU. +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_) { + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : std::vector( + {"depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass", + "conv_relu_mkldnn_fuse_pass"})) { + passes_.push_back(pass); + } + } + use_mkldnn_ = true; +#else + use_mkldnn_ = false; +#endif +} + +void CpuPassStrategy::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_quantizer_) { + passes_.push_back("cpu_quantize_placement_pass"); + } + use_mkldnn_quantizer_ = true; +#else + use_mkldnn_quantizer_ = false; +#endif +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 48da8c156f426477011bcc060260c812ad94df23..09ef195d5e66aff0cef17f1594de34c656187a35 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -109,43 +109,16 @@ class CpuPassStrategy : public PassStrategy { CpuPassStrategy(); explicit CpuPassStrategy(const CpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = other.use_gpu_; + use_mkldnn_ = other.use_mkldnn_; + use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + } virtual ~CpuPassStrategy() = default; - void EnableMKLDNN() override { -// TODO(Superjomn) Consider the way to mix CPU with GPU. -#ifdef PADDLE_WITH_MKLDNN - if (!use_mkldnn_) { - passes_.insert(passes_.begin(), "mkldnn_placement_pass"); - - for (auto &pass : std::vector( - {"depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_bias_mkldnn_fuse_pass", // - "conv3d_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass"})) { - passes_.push_back(pass); - } - } - use_mkldnn_ = true; -#else - use_mkldnn_ = false; -#endif - } - - void EnableMkldnnQuantizer() override { -#ifdef PADDLE_WITH_MKLDNN - if (!use_mkldnn_quantizer_) { - passes_.push_back("cpu_quantize_placement_pass"); - } - use_mkldnn_quantizer_ = true; -#else - use_mkldnn_quantizer_ = false; -#endif - } + void EnableMKLDNN() override; + void EnableMkldnnQuantizer() override; protected: bool use_mkldnn_quantizer_{false}; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 647913cc80727786379e2e5525b372818e423d23..8ecb0310c9775393631b99681e13cbea7a5b781e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -26,7 +26,11 @@ endfunction() function(inference_analysis_api_int8_test target model_dir data_dir filename) inference_analysis_test(${target} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark - ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100) + ARGS --infer_model=${model_dir}/model + --infer_data=${data_dir}/data.bin + --warmup_batch_size=100 + --batch_size=50 + --iterations=2) endfunction() function(inference_analysis_api_test_with_fake_data target install_dir filename model_name) @@ -146,22 +150,22 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_con # int8 image classification tests if(WITH_MKLDNN) - set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8") + set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") if (NOT EXISTS ${INT8_DATA_DIR}) - inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz") + inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz") endif() #resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR}) - inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" ) + inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) #mobilenet int8 set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet") if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR}) - inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" ) + inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index e73358d8827a40786beb05fad931267b0dd88f6b..9b2e74ec16eb3b6e98bfcc8cc546ed74a7966f33 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -154,7 +154,7 @@ void profile(bool use_mkldnn = false) { config.EnableMKLDNN(); } - std::vector outputs; + std::vector> outputs; std::vector> inputs; LoadInputData(&inputs); TestPrediction(reinterpret_cast(&config), diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 735e4fb563788438ee49ff6308d11f4dbe4962be..e10d239a5d1b30e089a110c6155520e3b035860a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -197,7 +197,7 @@ void profile(bool use_mkldnn = false) { cfg.SetMKLDNNOp(op_list); } - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -206,9 +206,11 @@ void profile(bool use_mkldnn = false) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { PADDLE_ENFORCE_GT(outputs.size(), 0); - size_t size = GetSize(outputs[0]); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); } diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 5a4f9a31a164a8fca3f80ce2fe2e6065fd04b340..ece094717b8076321c68d7fdd29f07c4da6b0ed4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -17,8 +17,6 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" -DEFINE_int32(iterations, 0, "Number of iterations"); - namespace paddle { namespace inference { namespace analysis { @@ -30,8 +28,13 @@ void SetConfig(AnalysisConfig *cfg) { cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(false); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); - cfg->EnableMKLDNN(); + cfg->pass_builder()->SetPasses( + {"infer_clean_graph_pass", "mkldnn_placement_pass", + "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass", + "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass", + "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass", + "fc_fuse_pass", "is_test_pass"}); } template @@ -40,8 +43,8 @@ class TensorReader { TensorReader(std::ifstream &file, size_t beginning_offset, std::vector shape, std::string name) : file_(file), position(beginning_offset), shape_(shape), name_(name) { - numel = - std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); + numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1}, + std::multiplies()); } PaddleTensor NextBatch() { @@ -71,10 +74,14 @@ class TensorReader { }; std::shared_ptr> GetWarmupData( - const std::vector> &test_data, int num_images) { + const std::vector> &test_data, + int num_images = FLAGS_warmup_batch_size) { int test_data_batch_size = test_data[0][0].shape[0]; - CHECK_LE(static_cast(num_images), - test_data.size() * test_data_batch_size); + auto iterations_max = test_data.size(); + PADDLE_ENFORCE( + static_cast(num_images) <= iterations_max * test_data_batch_size, + "The requested quantization warmup data size " + + std::to_string(num_images) + " is bigger than all test data size."); PaddleTensor images; images.name = "input"; @@ -120,20 +127,17 @@ void SetInput(std::vector> *inputs, std::vector image_batch_shape{batch_size, 3, 224, 224}; std::vector label_batch_shape{batch_size, 1}; + auto images_offset_in_file = static_cast(file.tellg()); auto labels_offset_in_file = - static_cast(file.tellg()) + - sizeof(float) * total_images * - std::accumulate(image_batch_shape.begin() + 1, - image_batch_shape.end(), 1, std::multiplies()); + images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; - TensorReader image_reader(file, 0, image_batch_shape, "input"); + TensorReader image_reader(file, images_offset_in_file, + image_batch_shape, "input"); TensorReader label_reader(file, labels_offset_in_file, label_batch_shape, "label"); - auto iterations = total_images / batch_size; - if (FLAGS_iterations > 0 && FLAGS_iterations < iterations) - iterations = FLAGS_iterations; - for (auto i = 0; i < iterations; i++) { + auto iterations_max = total_images / batch_size; + for (auto i = 0; i < iterations_max; i++) { auto images = image_reader.NextBatch(); auto labels = label_reader.NextBatch(); inputs->emplace_back( @@ -148,20 +152,21 @@ TEST(Analyzer_int8_resnet50, quantization) { AnalysisConfig q_cfg; SetConfig(&q_cfg); + // read data from file and prepare batches with test data std::vector> input_slots_all; - SetInput(&input_slots_all, 100); + SetInput(&input_slots_all); + // prepare warmup batch from input data read earlier + // warmup batch size can be different than batch size std::shared_ptr> warmup_data = - GetWarmupData(input_slots_all, 100); + GetWarmupData(input_slots_all); + // configure quantizer q_cfg.EnableMkldnnQuantizer(); q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); - q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100); + q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); - CompareQuantizedAndAnalysis( - reinterpret_cast(&cfg), - reinterpret_cast(&q_cfg), - input_slots_all); + CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 347672eaae314aa42096d48a3b044014f2ddbf84..142905dcd8d9964d93d0c5f7444823eef2b84900 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -124,7 +124,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_LAC, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -137,11 +137,13 @@ TEST(Analyzer_LAC, profile) { 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); PADDLE_ENFORCE_GE(size, batch1_size); - int64_t *pdata = static_cast(outputs[0].data.data()); + int64_t *pdata = static_cast(output[0].data.data()); for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 089f655c180d784af66af60277bdbf32a6019599..2eb347a44b394a55706d5aa88bee7fe1fcc7838e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -96,7 +96,7 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; if (use_mkldnn) { cfg.EnableMKLDNN(); @@ -108,8 +108,9 @@ void profile(bool use_mkldnn = false) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - PADDLE_ENFORCE_EQ(outputs.size(), 2UL); - for (auto &output : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL); + for (auto &output : outputs.back()) { size_t size = GetSize(output); PADDLE_ENFORCE_GT(size, 0); float *result = static_cast(output.data.data()); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12..36e07d5f55600dc7aa96227289f707fb19f92d56 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -106,7 +106,7 @@ void SetInput(std::vector> *inputs) { void profile(bool memory_load = false) { AnalysisConfig cfg; SetConfig(&cfg, memory_load); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -117,10 +117,12 @@ void profile(bool memory_load = false) { // the first inference result const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - int64_t *result = static_cast(outputs[0].data.data()); + int64_t *result = static_cast(output[0].data.data()); for (size_t i = 0; i < std::min(11UL, size); i++) { EXPECT_EQ(result[i], chinese_ner_result_data[i]); } diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 5157bd280d0f3ee327d5cee7799477b5e6fd3f71..9443b08063b8f61d3d6b291a7217d645d8825c54 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -127,7 +127,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Pyramid_DNN, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -135,10 +135,12 @@ TEST(Analyzer_Pyramid_DNN, profile) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). for (size_t i = 0; i < size; i++) { EXPECT_GT(result[i], 0); diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 629981d565f1b6eeabc192287cb9f892df21b8e4..d4330e6cddf8818ace01be2f13a4c18a192c46e1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -40,7 +40,7 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); } - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c..54fd3a4a4caba52110ab636e6d44ee2a473f0cb0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -229,7 +229,7 @@ TEST(Analyzer_rnn1, profile) { SetConfig(&cfg); cfg.DisableGpu(); cfg.SwitchIrDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -280,7 +280,7 @@ TEST(Analyzer_rnn1, compare_determine) { TEST(Analyzer_rnn1, multi_thread) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index 007f9f0b66a7b276f5f2e8500a3001788ad41e79..9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -126,7 +126,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_rnn2, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -136,9 +136,11 @@ TEST(Analyzer_rnn2, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result PADDLE_ENFORCE_GT(outputs.size(), 0); - size_t size = GetSize(outputs[0]); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); } diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 47c1d7375843e4bad212c1d7d621c9e6d45e5982..9f23b9f037bcaeb758312d011067ae29c82e73cd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -110,7 +110,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_seq_conv1, profile) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -119,10 +119,12 @@ TEST(Analyzer_seq_conv1, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); + float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). for (size_t i = 0; i < size; i++) { EXPECT_GT(result[i], 0); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 19fa5528da4d11d2eb1a2f932f60a84c3f5468e7..d6f7f468a6c83bd6c4ac087931d0c6b7cac3cc1c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -156,7 +156,7 @@ void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg, use_mkldnn); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 2003be82019333ca97b9fa8ef83668825fe5710d..54492dbc238bbaf25f86b300fdd6585f74365088 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); cfg.SwitchIrDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) { if (FLAGS_num_threads == 1) { // Get output - LOG(INFO) << "get outputs " << outputs.size(); - for (auto &output : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + LOG(INFO) << "get outputs " << outputs.back().size(); + for (auto &output : outputs.back()) { LOG(INFO) << "output.shape: " << to_string(output.shape); // no lod ? CHECK_EQ(output.lod.size(), 0UL); diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc index a925da312cde30380b4997b8b76a0d425a71e817..bd4f1b61973fb0de06dcc288e329c94756d5ed47 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -186,7 +186,7 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - std::vector outputs; + std::vector> outputs; if (use_mkldnn) { cfg.EnableMKLDNN(); } diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8..fb47048cd0ccc887927cb4b533d45df11ef633eb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) { cfg.EnableMKLDNN(); } // cfg.pass_builder()->TurnOnDebug(); - std::vector outputs; + std::vector> outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) { auto refer = ProcessALine(line); file.close(); - auto &output = outputs.front(); + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto &output = outputs.back().front(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); for (size_t i = 0; i < numel; ++i) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 33f1d0254858814be20eee1a6c2faaf00c2e8178..9a0dcc722cf00984b8c0e3ac20f13849e2904102 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -41,7 +41,10 @@ DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); DEFINE_string(refer_result, "", "reference result for comparison"); -DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(batch_size, 1, "batch size"); +DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); +// setting iterations to 0 means processing the whole dataset +DEFINE_int32(iterations, 0, "number of batches to process"); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); @@ -239,7 +242,7 @@ void SetFakeImageInput(std::vector> *inputs, } input.shape = shape; input.dtype = PaddleDType::FLOAT32; - size_t len = std::accumulate(shape.begin(), shape.end(), 1, + size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1}, [](int a, int b) { return a * b; }); input.data.Resize(len * sizeof(float)); input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); @@ -286,17 +289,18 @@ void ConvertPaddleTensorToZeroCopyTensor( void PredictionWarmUp(PaddlePredictor *predictor, const std::vector> &inputs, - std::vector *outputs, int num_threads, - int tid) { + std::vector> *outputs, + int num_threads, int tid) { int batch_size = FLAGS_batch_size; LOG(INFO) << "Running thread " << tid << ", warm up run..."; if (FLAGS_zero_copy) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); } + outputs->resize(1); Timer warmup_timer; warmup_timer.tic(); if (!FLAGS_zero_copy) { - predictor->Run(inputs[0], outputs, batch_size); + predictor->Run(inputs[0], &(*outputs)[0], batch_size); } else { predictor->ZeroCopyRun(); } @@ -308,11 +312,16 @@ void PredictionWarmUp(PaddlePredictor *predictor, void PredictionRun(PaddlePredictor *predictor, const std::vector> &inputs, - std::vector *outputs, int num_threads, - int tid) { - int batch_size = FLAGS_batch_size; + std::vector> *outputs, + int num_threads, int tid) { int num_times = FLAGS_repeat; - LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; + int iterations = inputs.size(); // process the whole dataset ... + if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size()) + iterations = + FLAGS_iterations; // ... unless the number of iterations is set + outputs->resize(iterations); + LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads + << ", run " << num_times << " times..."; Timer run_timer; double elapsed_time = 0; #ifdef WITH_GPERFTOOLS @@ -320,14 +329,14 @@ void PredictionRun(PaddlePredictor *predictor, #endif if (!FLAGS_zero_copy) { run_timer.tic(); - for (size_t i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < iterations; i++) { for (int j = 0; j < num_times; j++) { - predictor->Run(inputs[i], outputs, batch_size); + predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); } } elapsed_time = run_timer.toc(); } else { - for (size_t i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < iterations; i++) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); run_timer.tic(); for (int j = 0; j < num_times; j++) { @@ -340,13 +349,14 @@ void PredictionRun(PaddlePredictor *predictor, ProfilerStop(); #endif - PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, - inputs.size()); + auto batch_latency = elapsed_time / (iterations * num_times); + PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency, + iterations); if (FLAGS_record_benchmark) { Benchmark benchmark; benchmark.SetName(FLAGS_model_name); - benchmark.SetBatchSize(batch_size); - benchmark.SetLatency(elapsed_time / num_times); + benchmark.SetBatchSize(FLAGS_batch_size); + benchmark.SetLatency(batch_latency); benchmark.PersistToFile("benchmark_record.txt"); } } @@ -354,16 +364,17 @@ void PredictionRun(PaddlePredictor *predictor, void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, bool use_analysis = true) { + std::vector> *outputs, bool use_analysis = true) { auto predictor = CreateTestPredictor(config, use_analysis); - PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); - PredictionRun(predictor.get(), inputs, outputs, 1, 0); + PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, + 0); + PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0); } void TestMultiThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, int num_threads, + std::vector> *outputs, int num_threads, bool use_analysis = true) { std::vector threads; std::vector> predictors; @@ -376,7 +387,7 @@ void TestMultiThreadPrediction( threads.emplace_back([&, tid]() { // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. - std::vector outputs_tid; + std::vector> outputs_tid; auto &predictor = predictors[tid]; #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { @@ -384,8 +395,8 @@ void TestMultiThreadPrediction( ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif - PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); - PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); + PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid); + PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid); }); } for (int i = 0; i < num_threads; ++i) { @@ -395,8 +406,8 @@ void TestMultiThreadPrediction( void TestPrediction(const PaddlePredictor::Config *config, const std::vector> &inputs, - std::vector *outputs, int num_threads, - bool use_analysis = FLAGS_use_analysis) { + std::vector> *outputs, + int num_threads, bool use_analysis = FLAGS_use_analysis) { PrintConfig(config, use_analysis); if (num_threads == 1) { TestOneThreadPrediction(config, inputs, outputs, use_analysis); @@ -406,30 +417,41 @@ void TestPrediction(const PaddlePredictor::Config *config, } } -void CompareTopAccuracy(const std::vector &output_slots1, - const std::vector &output_slots2) { - // first output: avg_cost - if (output_slots1.size() == 0 || output_slots2.size() == 0) +void CompareTopAccuracy( + const std::vector> &output_slots_quant, + const std::vector> &output_slots_ref) { + if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0) throw std::invalid_argument( "CompareTopAccuracy: output_slots vector is empty."); - PADDLE_ENFORCE(output_slots1.size() >= 2UL); - PADDLE_ENFORCE(output_slots2.size() >= 2UL); - // second output: acc_top1 - if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0) - throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); - if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 || - output_slots2[1].dtype != paddle::PaddleDType::FLOAT32) - throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output is of a wrong type."); - float *top1_quantized = static_cast(output_slots1[1].data.data()); - float *top1_reference = static_cast(output_slots2[1].data.data()); - LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized; - LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference; + float total_accs1_quant{0}; + float total_accs1_ref{0}; + for (size_t i = 0; i < output_slots_quant.size(); ++i) { + PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL); + PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL); + // second output: acc_top1 + if (output_slots_quant[i][1].lod.size() > 0 || + output_slots_ref[i][1].lod.size() > 0) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); + if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 || + output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output is of a wrong type."); + total_accs1_quant += + *static_cast(output_slots_quant[i][1].data.data()); + total_accs1_ref += + *static_cast(output_slots_ref[i][1].data.data()); + } + float avg_acc1_quant = total_accs1_quant / output_slots_quant.size(); + float avg_acc1_ref = total_accs1_ref / output_slots_ref.size(); + + LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_quant; + LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_ref; LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy; - CHECK_LE(std::abs(*top1_quantized - *top1_reference), - FLAGS_quantized_accuracy); + CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -455,20 +477,35 @@ void CompareNativeAndAnalysis( const PaddlePredictor::Config *config, const std::vector> &inputs) { PrintConfig(config, true); - std::vector native_outputs, analysis_outputs; + std::vector> native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - CompareResult(analysis_outputs, native_outputs); + PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); + PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + CompareResult(analysis_outputs.back(), native_outputs.back()); } void CompareQuantizedAndAnalysis( - const PaddlePredictor::Config *config, - const PaddlePredictor::Config *qconfig, + const AnalysisConfig *config, const AnalysisConfig *qconfig, const std::vector> &inputs) { - PrintConfig(config, true); - std::vector analysis_outputs, quantized_outputs; - TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true); + PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, + "Input data has to be packed batch by batch."); + LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size + << ", warmup batch size " << FLAGS_warmup_batch_size << "."; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true); + + LOG(INFO) << "--- INT8 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> quantized_outputs; + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true); + + LOG(INFO) << "--- comparing outputs --- "; CompareTopAccuracy(quantized_outputs, analysis_outputs); } @@ -578,9 +615,9 @@ static bool CompareTensorData(const framework::LoDTensor &a, const framework::LoDTensor &b) { auto a_shape = framework::vectorize(a.dims()); auto b_shape = framework::vectorize(b.dims()); - size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); - size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); if (a_size != b_size) { LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index cb668a4174134ba3ce9517955ff740ada568e97b..98ce225a0476b38c021b0b81489f69d7953ae456 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - std::vector outputs; + std::vector> outputs; if (use_analysis || use_tensorrt) { AnalysisConfig config; config.EnableUseGpu(100, 0); diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 712449f6be87a3cfb61f099ad6291875c8ad1292..f0e3d3e86f24cb68f6e9d41f48c9698b43bca13e 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -8,9 +8,6 @@ conv_shift cos cos_sim dequantize -elementwise_div -elementwise_max -elementwise_min elu fc flatten @@ -28,8 +25,6 @@ gelu gru hard_shrink hierarchical_sigmoid -hinge_loss -huber_loss leaky_relu log logsigmoid @@ -57,7 +52,6 @@ requantize reshape rnn_memory_helper round -row_conv sequence_softmax sin softplus diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index fc15d56891cf7af10a91ca22a09c84fa2e52d465..7e2740f148f1d273310f44ed4a35d413e7201394 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { virtual void Apply() = 0; }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index dd28f82b65403550c67418cae535bbfeeef4476e..f0dc718195506e89bf9fecc0eb5e0d5117275a33 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include +#include +#include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" @@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), - Outputs(framework::GradVarName("Input"))); + const auto &ins = Inputs("Input"); + const auto &d_ins = Outputs(framework::GradVarName("Input")); + const auto &conds = Inputs("Cond"); + const auto &d_conds = Outputs(framework::GradVarName("Cond")); + + std::vector ins_conds_grads; + ins_conds_grads.reserve(ins.size() + conds.size()); + for (auto &in : ins) { + ins_conds_grads.emplace_back(framework::GradVarName(in)); + } + for (auto &cond : conds) { + ins_conds_grads.emplace_back(framework::GradVarName(cond)); + } + + exec.Run(*block->Program(), &cur_scope, block->ID(), false, true, + ins_conds_grads); + + AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(), + ins.size(), d_ins); - AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), - Outputs(framework::GradVarName("Cond"))); + AssignLocalGradientToGlobal(dev_place, cur_scope, + ins_conds_grads.data() + ins.size(), + conds.size(), d_conds); } } private: void AssignLocalGradientToGlobal( const platform::Place &place, const framework::Scope &cur_scope, - const std::vector &p_names, + const std::string *p_grad_names, size_t p_grad_names_num, const std::vector &pg_names) const { - for (size_t i = 0; i < p_names.size(); ++i) { + for (size_t i = 0; i < p_grad_names_num; ++i) { auto out_grad_name = pg_names[i]; - auto in_grad_name = framework::GradVarName(p_names[i]); + const auto &in_grad_name = p_grad_names[i]; auto *in_var = cur_scope.FindVar(in_grad_name); if (in_var == nullptr) { continue; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 85612ba47448a7b0d712e9314e3980019c96e9c3..530a54b7ca186008bc8ec4b083254e65378ae619 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,10 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseDivOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Div"; } + std::string GetEquation() const override { return "Out = X / Y"; } +}; + +class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_div_grad"); + op->SetInput("Y", Input("Y")); + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); +REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, + ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseDivGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 8a07339077aeaa4403ffd1e1e30e0d58a9cc30e7..0f0ad8637301772f073bca305b9196b9c7865daf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -47,7 +47,7 @@ struct DivGradDX { template struct DivGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * x / (y * y); + return -dout * out / y; } }; @@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel { ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + + auto* x = dout; // Fake x, not used + ElemwiseGradCompute, DivGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index ea0dcd736e5700fb0f341938ac3e3e3b178f29c1..b7df9c6f845dfc941e3c6acbc986a584e984a1de 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Max"; } + std::string GetEquation() const override { return "Out = max(X, Y)"; } +}; + +class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_max_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)"); + +REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, + ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMaxGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index 3ee0c32e0d5d5df02d5d157416918fb4fb3aca92..abdb1b9671de80d02b9a6a788088f47929fcc6f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MaxGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index b263b9addd40cfd329d2cc8588c278df2cb008e9..f60c0ed8a0faad384f4eaa631c2758f83bc56414 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Min"; } + std::string GetEquation() const override { return "Out = min(X, Y)"; } +}; + +class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_min_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)"); + +REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, + ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMinGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index d04e372faaa4e6296e982afe6155cdde2fec4f81..1a49a6013987ae1ec685ec91ca656e4756ba7c32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MinGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6dbb9072495f743a4df1ff05e029a227c2cf618b..95246b38f530ff5f81e1fbb5f1dd22149943c8ff 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + auto out_grad_name = framework::GradVarName("Out"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE(ctx->HasInput(out_grad_name), "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = ctx->GetInputDim(out_grad_name); auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), @@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); + ctx->ShareDim(out_grad_name, /*->*/ x_grad_name); + ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name); } if (ctx->HasOutput(y_grad_name)) { ctx->ShareDim("Y", /*->*/ y_grad_name); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 453a1b32a0171a2ca88879ab3287e89c4d3c7759..b8921b171cf37be17fb62d270a5c22f9d1806c64 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -46,6 +46,7 @@ obtained from the `input` tensor. )DOC"); } }; + } // namespace operators } // namespace paddle @@ -53,7 +54,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOp, paddle::framework::EmptyGradOpMaker, - ops::FillConstantBatchSizeLikeOpMaker); + ops::FillConstantBatchSizeLikeOpMaker, + ops::BatchSizeLikeNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel(ctx.Attr("dtype")), + ctx.GetPlace()); + } }; + +class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker { + protected: + void ExtraMake() override { + this->AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::VarType::FP32); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); + +REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2, + ops::FillZerosLikeOp2Maker, + ops::FillZerosLikeOp2NoNeedBufferVarsInference, + paddle::framework::EmptyGradOpMaker); + REGISTER_OP_CPU_KERNEL( fill_zeros_like, ops::FillZerosLikeKernel, @@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CPU_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index e80a703c30c0335124c089ea82ba4f6fe055acde..1831635def79b3ccb713dbc14cc70b8beeb609fc 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 98ebe1fdf4bb3308b2f07a073072031e79e14146..01302687a421165e908b2aa0646ba8b9c835034e 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,17 +65,13 @@ by input arguments. } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( - GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); - } // namespace operators } // namespace paddle -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); +REGISTER_OPERATOR(gaussian_random_batch_size_like, + paddle::operators::GaussianRandomBatchSizeLikeOp, + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index f458ce6c83bfcfb56d558409b0802f27f13a4761..b6cfa9cc43c312e60a1b7c5e13d1ecbe6bc5dc7d 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hinge_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel { } }; +class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("hinge_loss_grad"); + op->SetInput("Logits", Input("Logits")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HingeLossGradOpDescMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OP_CPU_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 253b65a5f33308fc2c94537641b0fa19378b0cc9..a72db384c1f09f66ecf7ce85271d6263bbdcb523 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/huber_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -90,38 +93,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Residual"), - "Input(Residual) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto residual_dims = ctx->GetInputDim("Residual"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(residual_dims, x_dims); - PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + ctx->SetOutputDim(x_grad_name, residual_dims); } if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); + ctx->SetOutputDim(y_grad_name, residual_dims); } } }; +class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("huber_loss_grad"); + op->SetInput("Residual", Output("Residual")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HuberLossGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( huber_loss, ops::HuberLossKernel, diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 656728c609eb19f90390d9dec72d9e30fd3040fd..435c755df3642ae0ba5144a89ed30ed6e0b63258 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::proto::VarType::FP32, platform::CPUPlace()); + framework::proto::VarType::FP32, ctx.GetPlace()); return kt; } }; diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..59ba660af79bff02cd350afb3eb7675bfe8ac498 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -0,0 +1,135 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" +#include + +namespace paddle { +namespace operators { + +class PixelShuffleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PixelShuffleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PixelShuffleOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0, + "Upscale_factor should devide the number of channel"); + + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + ctx->SetOutputDim("Out", output_dims); + } +}; + +class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor, default Tensor), " + "the input feature data of PixelShuffleOp, the layout is [N C H W]."); + AddOutput( + "Out", + "(Tensor, default Tensor), the output of " + "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor]."); + AddAttr("upscale_factor", + "the factor to increase spatial resolution by.") + .SetDefault(1) + .AddCustomChecker([](const int& upscale_factor) { + PADDLE_ENFORCE_GE(upscale_factor, 1, + "upscale_factor should be larger than 0."); + }); + + AddComment(R"DOC( + Pixel Shuffle operator + This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(C, H \times r, W \times r)`. + + This is useful for implementing efficient sub-pixel convolution + with a stride of :math:`1/r`. + + Please refer to the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient + Sub-Pixel Convolutional Neural Network `_ + by Shi et. al (2016) for more details. + + )DOC"); + } +}; + +class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("pixel_shuffle_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class PixelShuffleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE(do_dims.size() == 4, "The layout of input is NCHW."); + + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, + ops::PixelShuffleGradMaker); + +REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle, + ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6faf91079e1dac00b3516ccde8dc82cec73a79e6 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle, ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1ae1c7e9d50cb9d701fd0e79337a1906f2f5d545 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class PixelShuffleOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + framework::Tensor t; + t.ShareDataWith(*in); + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + + std::vector axis = {0, 1, 4, 2, 5, 3}; + + framework::Tensor o; + o.ShareDataWith(*out); + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + out->Resize(o_dims); + } +}; + +template +class PixelShuffleGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + framework::Tensor t; + t.ShareDataWith(*dout); + t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + + std::vector axis = {0, 1, 3, 5, 2, 4}; + + framework::Tensor o; + o.ShareDataWith(*dx); + o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + dx->Resize(dx_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index d283bddbe9f974ac6835ee91d5a7851453687b80..81aabdd0061b3940f23d4731d55fc5cbe5817004 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" +#include +#include +#include + #include "paddle/fluid/framework/eigen.h" namespace paddle { @@ -54,7 +58,6 @@ class RowConvGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Filter"), "Input(Filter) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), @@ -62,8 +65,8 @@ class RowConvGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, dout_dims); } auto filter_grad_name = framework::GradVarName("Filter"); @@ -259,12 +262,31 @@ class RowConvGradKernel } } }; + +class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("row_conv_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::RowConvGradOpDescMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OP_CPU_KERNEL( row_conv, ops::RowConvKernel); diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 75d6181749e4e9bd81a3c02de69caf0acd81eef9..7260fe25d6ebb357040af8774c574b767bfd9f13 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -64,8 +64,9 @@ with random values sampled from a uniform distribution. } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - uniform_random_batch_size_like, - paddle::operators::UniformRandomBatchSizeLikeOp, - paddle::operators::UniformRandomBatchSizeLikeOpMaker); +REGISTER_OPERATOR(uniform_random_batch_size_like, + paddle::operators::UniformRandomBatchSizeLikeOp, + paddle::operators::UniformRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in uniform_random_op.cc and uniform_random_op.cu diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c8a0aa58859cca06375ce578e5a7097179e23107..16365c1fd0b0adb914cdfd08e3f6542fca952e06 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor imperative_profiler) + tracer analysis_predictor imperative_profiler nccl_context) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index e9ed4e16443eba481143bd2095f9970bcb167d71..265707f1bccdabd37b9a7248755d0b81339418c3 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -29,7 +29,7 @@ namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module* m) { +void BindImperative(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", [](imperative::Tracer& self, framework::BlockDesc* root_block) { @@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) { }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); + + // define parallel context + pybind11::class_ parallel_strategy( + *m, "ParallelStrategy", ""); + parallel_strategy.def(pybind11::init()) + .def_property( + "nranks", + [](const imperative::ParallelStrategy& self) { return self.nranks_; }, + [](imperative::ParallelStrategy& self, int nranks) { + self.nranks_ = nranks; + }) + .def_property("local_rank", + [](const imperative::ParallelStrategy& self) { + return self.local_rank_; + }, + [](imperative::ParallelStrategy& self, int local_rank) { + self.local_rank_ = local_rank; + }) + .def_property( + "trainer_endpoints", + [](const imperative::ParallelStrategy& self) { + return self.trainer_endpoints_; + }, + [](imperative::ParallelStrategy& self, std::vector eps) { + self.trainer_endpoints_ = eps; + }) + .def_property("current_endpoint", + [](const imperative::ParallelStrategy& self) { + return self.current_endpoint_; + }, + [](imperative::ParallelStrategy& self, + const std::string& ep) { self.current_endpoint_ = ep; }); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + pybind11::class_ nccl_ctx( + *m, "NCCLParallelContext"); + + nccl_ctx + .def(pybind11::init()) + .def("init", [](imperative::NCCLParallelContext& self) { self.Init(); }); +#endif } } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 8496cbfcb18798ee8ce1714431b7877bb2b7d377..f9d4a7c990e23b30eb7f5086fe56587f7c38bd22 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/nccl_context.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase { using imperative::VarBase::VarBase; // Inherit constructors }; -void BindTracer(pybind11::module* m); +void BindImperative(pybind11::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 044677fb756e0368c65b84f15fdf2540abbd14b8..8c34e3efe2a07cadde5aa06669fda88be7661db1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) { }) .def_static("num_funcs", &imperative::PyLayer::NumFuncs); - BindTracer(&m); + BindImperative(&m); py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 03c4078775d455fdb19aaf78ace4dcb98c8dd66a..d8153fa00267b00eedc52aa043af9ba7dc090f7d 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -32,6 +32,7 @@ default_envs = { "NCCL_SOCKET_IFNAME": "eth0", "NCCL_IB_GID_INDEX": "3", "NCCL_IB_RETRY_CNT": "0", + "PYTHONPATH": os.getenv("PYTHONPATH", ""), } GPUS = 8 diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6303be003a701e57a8aa1e2f925459f416cdb543..9fd53a74bf51929f9e115fdc94f2f85f8e2fbdda 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): for idx, op_desc in enumerate(op_descs): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: - to_insert.append((_create_op_desc_("fill_zeros_like", { - "X": [_strip_grad_suffix_(arg)] - }, {"Out": [arg]}, {}), idx)) + x_in = _strip_grad_suffix_(arg) + x_in_var_desc = op_desc.block().find_var_recursive( + cpt.to_bytes(x_in)) + assert x_in_var_desc is not None, "Variable {} not found".format( + x_in) + dtype = x_in_var_desc.dtype() + + to_insert.append( + (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + {"Out": [arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 2d0c7b7ddaacee28da599d5850e9b3381c01de5c..9bb72ede304dbde732153bac980f24a74bcd126d 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -29,6 +29,9 @@ from .tracer import * from . import profiler from .profiler import * +from . import parallel +from .parallel import * + from . import checkpoint from .checkpoint import * @@ -41,5 +44,6 @@ __all__ += base.__all__ __all__ += nn.__all__ __all__ += tracer.__all__ __all__ += profiler.__all__ +__all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 178e6cd48613cf25ea3026c6bf097b9b886e11bf..71abb9e3eca974138fe2d8bedd41e4d58983f80c 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -15,19 +15,20 @@ from __future__ import print_function from six.moves import reduce -import numpy as np from .. import core from ..layers import utils from . import layers -from ..framework import Variable, OpProtoHolder, Parameter -from ..layers import layer_function_generator +from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer +import numpy as np __all__ = [ - 'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv' + 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', + 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', + 'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm', + 'TreeConv' ] @@ -137,6 +138,303 @@ class Conv2D(layers.Layer): return self._helper.append_activation(pre_act, act=self._act) +class Conv3D(layers.Layer): + """ + **Convlution3D Layer** + + The convolution3D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are in NCDHW format. Where N is batch size C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convlution3D is similar with Convlution2D + but adds one dimension(depth). If bias attribution and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv3d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None. + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + def __init__(self, + name_scope, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None): + assert param_attr is not False, "param_attr should not be False here." + super(Conv3D, self).__init__(name_scope) + self._groups = groups + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._act = act + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._filter_size = filter_size + self._num_filters = num_filters + self._param_attr = param_attr + self._bias_attr = bias_attr + + def _build_once(self, input): + num_channels = input.shape[1] + self._dtype = self._helper.input_dtype(input) + + if self._groups is None: + num_filter_channels = num_channels + else: + if num_channels % self._groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // self._groups + + filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size') + + filter_shape = [self._num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ + 2] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='conv3d', + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act, act=self._act) + + +class Conv3DTranspose(layers.Layer): + def __init__(self, + name_scope, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + super(Conv3DTranspose, self).__init__(name_scope) + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + assert param_attr is not False, "param_attr should not be False in conv3d_transpose." + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._param_attr = param_attr + self._filter_size = filter_size + self._output_size = output_size + self._groups = 1 if groups is None else groups + self._num_filters = num_filters + self._use_cudnn = use_cudnn + self._bias_attr = bias_attr + self._act = act + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + self._input_channel = input.shape[1] + + if self._filter_size is None: + if self._output_size is None: + raise ValueError( + "output_size must be set when filter_size is None") + if isinstance(self._output_size, int): + self._output_size = [self._output_size, self._output_size] + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (self._output_size[0] - + (d_in - 1) * self._stride[0] + 2 * self._padding[0] + - 1) // self._dilation[0] + 1 + filter_size_h = (self._output_size[1] - + (h_in - 1) * self._stride[1] + 2 * self._padding[1] + - 1) // self._dilation[1] + 1 + filter_size_w = (self._output_size[2] - + (w_in - 1) * self._stride[2] + 2 * self._padding[2] + - 1) // self._dilation[2] + 1 + self._filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + self._filter_size = utils.convert_to_list( + self._filter_size, 3, 'conv3d_transpose.filter_size') + + filter_shape = [ + self._input_channel, self._num_filters // self._groups + ] + self._filter_size + self._img_filter = self.create_parameter( + dtype=self._dtype, shape=filter_shape, attr=self._param_attr) + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type="conv3d_transpose", + inputs={'Input': [input], + 'Filter': [self._img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn + }) + + if self._bias_attr: + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + else: + pre_act = pre_bias + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_act, act=self._act) + + class Pool2D(layers.Layer): def __init__(self, name_scope, @@ -1388,6 +1686,8 @@ class SequenceConv(layers.Layer): bias_attr=None, param_attr=None, act=None): + assert not _in_dygraph_mode( + ), "SequenceConv is not supported by dynamic graph mode yet!" super(SequenceConv, self).__init__(name_scope) self._num_filters = num_filters self._filter_size = filter_size @@ -1397,12 +1697,10 @@ class SequenceConv(layers.Layer): self._param_attr = param_attr def _build_once(self, input): - self._dtype = self._helper.input_dtype(input) - print(self._filter_size) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( - attr=self.param_attr, shape=filter_shape, dtype=self._dtype) + attr=self._param_attr, shape=filter_shape, dtype=self._dtype) def forward(self, input): pre_bias = self._helper.create_variable_for_type_inference(self._dtype) @@ -1420,3 +1718,237 @@ class SequenceConv(layers.Layer): }) pre_act = self._helper.append_bias_op(pre_bias) return self._helper.append_activation(pre_act) + + +class RowConv(layers.Layer): + def __init__(self, + name_scope, + future_context_size, + param_attr=None, + act=None): + assert not _in_dygraph_mode( + ), "RowConv is not supported by dynamic graph mode yet!" + super(RowConv, self).__init__(name_scope) + self._act = act + self._param_attr = param_attr + self._future_context_size = future_context_size + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + filter_shape = [self._future_context_size + 1, input.shape[1]] + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='row_conv', + inputs={'X': [input], + 'Filter': [self._filter_param]}, + outputs={'Out': [out]}) + return self._helper.append_activation(out, act=self._act) + + +class GroupNorm(layers.Layer): + """ + **Group Normalization Layer** + + Refer to `Group Normalization `_ . + + Args: + name_scope (str): See base class. + groups(int): The number of groups that divided from channels. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the bias is initialized one. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act(str): Activation to be applied to the output of group normalizaiton. + data_layout(string|NCHW): Only NCHW is supported. + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc + + Returns: + Variable: A tensor variable which is the result after applying group normalization on the input. + + + """ + + def __init__(self, + name_scope, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW'): + super(GroupNorm, self).__init__(name_scope) + self._param_attr = param_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._groups = groups + self._act = act + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + param_shape = [input.shape[1]] + if self._bias_attr: + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + + if self._param_attr: + self._scale = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + def forward(self, input): + inputs = {'X': input} + if self._bias: + inputs['Bias'] = self._bias + if self._scale: + inputs['Scale'] = self._scale + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._groups}) + + return self._helper.append_activation(group_norm_out, self._act) + + +class SpectralNorm(layers.Layer): + def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): + super(SpectralNorm, self).__init__(name_scope) + self._power_iters = power_iters + self._eps = eps + self._dim = dim + + def _build_once(self, weight): + self._dtype = self._helper.input_dtype(weight) + input_shape = weight.shape + h = input_shape[self._dim] + w = np.prod(input_shape) // h + + self.u = self.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.u.stop_gradient = True + + self.v = self.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.v.stop_gradient = True + + def forward(self, weight): + inputs = {'Weight': weight, 'U': self.u, 'V': self.v} + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={"Out": out, }, + attrs={ + "dim": self._dim, + "power_iters": self._power_iters, + "eps": self._eps, + }) + + return out + + +class TreeConv(layers.Layer): + def __init__(self, + name_scope, + output_size, + num_filters=1, + max_depth=2, + act='tanh', + param_attr=None, + bias_attr=None, + name=None): + super(TreeConv, self).__init__(name_scope) + self._name = name + self._output_size = output_size + self._act = act + self._max_depth = max_depth + self._num_filters = num_filters + self._bias_attr = bias_attr + self._param_attr = param_attr + + def _build_once(self, nodes_vector, edge_set): + assert isinstance(nodes_vector, Variable) + assert isinstance(edge_set, Variable) + self._dtype = self._helper.input_dtype(nodes_vector) + + feature_size = nodes_vector.shape[2] + w_shape = [feature_size, 3, self._output_size, self._num_filters] + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + self.W = self.create_parameter( + attr=self._param_attr, + shape=w_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, nodes_vector, edge_set): + if self._name: + out = self.create_variable( + name=self._name, dtype=self._dtype, persistable=False) + else: + out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='tree_conv', + inputs={ + 'NodesVector': nodes_vector, + 'EdgeSet': edge_set, + 'Filter': self.W + }, + outputs={'Out': out, }, + attrs={'max_depth': self._max_depth}) + if self._bias_attr: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [out], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_activation]}, + attrs={'axis': 1}) + else: + pre_activation = out + return self._helper.append_activation(pre_activation, act=self._act) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..f7decac963f47ba1dcc33e9c8eab7900e745d1df --- /dev/null +++ b/python/paddle/fluid/dygraph/parallel.py @@ -0,0 +1,60 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except jin compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from .. import core + +__all__ = ["prepare_context"] + +ParallelStrategy = core.ParallelStrategy + +__parallel_ctx__clz__ = None + + +def prepare_context(parallel_strategy, place): + global __parallel_ctx__clz__ + assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once." + + if isinstance(place, core.CUDAPlace): + __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy, + place) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + __parallel_ctx__clz__.init() + + +class Env(object): + def __init__(self): + self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", + "").split(",") + self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") + + @property + def nranks(self): + return self._nranks + + @property + def local_rank(self): + return self._local_rank + + @property + def dev_id(self): + return self._dev_id + + @property + def current_endpoint(self): + return self._current_endpoint diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 66526848c9b8d9cb17190e89193d790843f1628d..a5d4d3947ab9b9699f6fc8ac0d7f088ede345290 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -191,6 +191,7 @@ __all__ = [ 'kldiv_loss', 'tree_conv', 'npair_loss', + 'pixel_shuffle', 'fsp_matrix', ] @@ -10961,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): return l2loss + celoss +def pixel_shuffle(x, upscale_factor): + """ + + **Pixel Shuffle Layer** + + This layer rearranges elements in a tensor of shape [N, C, H, W] + to a tensor of shape [N, C/r**2, H*r, W*r]. + This is useful for implementing efficient sub-pixel convolution + with a stride of 1/r. + Please refer to the paper: `Real-Time Single Image and Video Super-Resolution + Using an Efficient Sub-Pixel Convolutional Neural Network `_ . + by Shi et. al (2016) for more details. + + .. code-block:: text + + Given a 4-D tensor with the shape: + x.shape = [1, 9, 4, 4] + Given upscale_factor: + upscale_factor= 3 + output shape is: + [1, 1, 12, 12] + + Args: + + x(Variable): The input tensor variable. + upscale_factor(int): factor to increase spatial resolution + + Returns: + + Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input. + + Raises: + + ValueError: If the square of upscale_factor cannot divide the channels of input. + + Examples: + + .. code-block:: python + + input = fluid.layers.data(shape=[9,4,4]) + output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) + + """ + + helper = LayerHelper("pixel_shuffle", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(upscale_factor, int): + raise TypeError("upscale factor must be int type") + + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor}) + return out + + def fsp_matrix(x, y): """ diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index fd07ff0ba3d21721fbbc46099f7dcb6937f93524..c7c82f28e7c441b4aa24ffa81a8695e565d737d8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -227,7 +227,7 @@ class Precision(MetricBase): metric.reset() for data in train_reader(): loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) + metric.update(preds=preds, labels=labels) numpy_precision = metric.eval() """ @@ -241,9 +241,11 @@ class Precision(MetricBase): raise ValueError("The 'preds' must be a numpy ndarray.") if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - sample_num = labels[0] + sample_num = labels.shape[0] + preds = np.rint(preds).astype("int32") + for i in range(sample_num): - pred = preds[i].astype("int32") + pred = preds[i] label = labels[i] if label == 1: if pred == label: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1390e759d7e309674a2ecb61c59043b0f5032400..0291bc25ed3145a580582fa30a965fd76047daba 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -81,6 +81,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) +list(REMOVE_ITEM TEST_OPS test_layers) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -118,7 +119,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) - +py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) if(NOT WIN32) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py new file mode 100644 index 0000000000000000000000000000000000000000..95cae1c2029c472c5a34b37a79739e2ff088feb2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py @@ -0,0 +1,23 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import unittest + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + +from test_conditional_block import * + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py index 4844d930daca75595376b1f1f67ae03011a713c6..a84ff1fd6d46c30ad7aa72f1b29a8ae668b90e20 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py @@ -23,6 +23,8 @@ from test_elementwise_sub_op import * from test_concat_op import * from test_gather_op import * from test_gaussian_random_batch_size_like_op import * +from test_uniform_random_batch_size_like_op import * +from test_fill_constant_batch_size_like_op import * from test_lod_reset_op import * from test_scatter_op import * from test_mean_op import * @@ -40,6 +42,7 @@ from test_sequence_unpad_op import * from test_sequence_scatter_op import * from test_sequence_slice_op import * from test_pad2d_op import * +from test_fill_zeros_like2_op import * if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..935653b07a6a4e1d344e8040fa4a0ed72b9b164d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from op_test import OpTest + + +class TestFillZerosLike2Op(OpTest): + def setUp(self): + self.op_type = "fill_zeros_like2" + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)} + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike2OpFp16(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float16 + + +class TestFillZerosLike2OpFp64(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 13f2d662178c7e1474ec43fdeadf7046516eb8e5..8b3094cb2a3a633cf89e571e85e494ce7e887063 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase): self.assertEqual(mlp._fc2, sublayers[1]) self.assertEqual(len(sublayers), 2) + def test_dygraph_vs_static(self): + inp1 = np.random.rand(4, 3, 3) + inp2 = np.random.rand(4, 3, 3) + + # dynamic graph + with fluid.dygraph.guard(): + if np.sum(inp1) < np.sum(inp2): + x = fluid.layers.elementwise_add(inp1, inp2) + else: + x = fluid.layers.elementwise_sub(inp1, inp2) + dygraph_result = x._numpy() + + # static graph + with new_program_scope(): + inp_data1 = fluid.layers.data( + name='inp1', shape=[3, 3], dtype=np.float32) + inp_data2 = fluid.layers.data( + name='inp2', shape=[3, 3], dtype=np.float32) + + a = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1]) + b = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1]) + cond = fluid.layers.less_than(x=a, y=b) + + ie = fluid.layers.IfElse(cond) + with ie.true_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_add(d1, d2) + ie.output(d3) + + with ie.false_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_sub(d1, d2) + ie.output(d3) + out = ie() + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + static_result = exe.run(fluid.default_main_program(), + feed={'inp1': inp1, + 'inp2': inp2}, + fetch_list=out)[0] + self.assertTrue(np.allclose(dygraph_result, static_result)) + def test_rnn(self): np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py index 732f0681c4e65006628d51e083a400c0b5bd3d92..89ae3c6a39d6277f590c8f2e02f7b0ae62a1cd4a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py @@ -302,8 +302,11 @@ use_py_reader = False # if we run sync mode sync = False -# how many batches we use -batch_num = 50 +if not core.is_compiled_with_cuda(): + # how many batches we use + batch_num = 50 +else: + batch_num = 5 np.random.seed = 1 src_word_np = np.random.randint( diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 302f26d1230b2d45cc2c7e8a4d10d41dd55bc188..fb40109bdc1d1619f8b676d105f112b3641855f6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -600,6 +600,280 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt)) + def test_conv3d(self): + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + ret = layers.conv3d(input=images, num_filters=3, filter_size=2) + static_ret = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + ret = conv3d(images) + static_ret2 = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + dy_ret = conv3d(base.to_variable(images)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_row_conv(self): + input = np.arange(15).reshape([3, 5]).astype('float32') + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.row_conv(input=x, future_context_size=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + rowConv = nn.RowConv('RowConv', future_context_size=2) + ret = rowConv(x) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + # TODO: dygraph can't support LODTensor + + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_group_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.group_norm(input=X, groups=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + ret = groupNorm(X) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + dy_ret = groupNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_spectral_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2) + static_ret = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place), + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + ret = spectralNorm(Weight) + static_ret2 = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + dy_ret = spectralNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_tree_conv(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10] + adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32') + adj = np.tile(adj, (1, 1, 1)) + vectors = np.random.random((1, 10, 5)).astype('float32') + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + ret = layers.tree_conv( + nodes_vector=NodesVector, + edge_set=EdgeSet, + output_size=6, + num_filters=1, + max_depth=2) + static_ret = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + ret = treeConv(NodesVector, EdgeSet) + static_ret2 = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.dynamic_graph(): + treeConv = nn.TreeConv( + 'SpectralNorm', output_size=6, num_filters=1, max_depth=2) + dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) + + self.assertTrue(np.allclose(static_ret, static_ret2)) + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + + def test_conv3d_transpose(self): + input_array = np.arange(0, 48).reshape( + [2, 3, 2, 2, 2]).astype('float32') + + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + out = layers.conv3d_transpose( + input=img, num_filters=12, filter_size=12, use_cudnn=False) + static_rlt = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + out = conv3d_transpose(img) + static_rlt2 = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.dynamic_graph(): + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + dy_rlt = conv3d_transpose(base.to_variable(input_array)) + self.assertTrue(np.allclose(static_rlt2, static_rlt)) + self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + class TestBook(LayerTest): def test_all_layers(self): @@ -1634,6 +1908,41 @@ class TestBook(LayerTest): out = layers.flatten(x, axis=1, name="flatten") return (out) + def test_kldiv_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[32, 128, 128], dtype="float32") + target = layers.data( + name='target', shape=[32, 128, 128], dtype="float32") + loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean') + self.assertIsNotNone(loss) + + print(str(program)) + + def test_temporal_shift(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.temporal_shift(x, seg_num=4, shift_ratio=0.2) + self.assertIsNotNone(out) + print(str(program)) + + def test_shuffle_channel(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[16, 4, 4], dtype="float32") + out = layers.shuffle_channel(x, group=4) + self.assertIsNotNone(out) + print(str(program)) + + def test_pixel_shuffle(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[9, 4, 4], dtype="float32") + out = layers.pixel_shuffle(x, upscale_factor=3) + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..cc3ae2b3b9d4c40a7ee992c04cac79f518acac6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -0,0 +1,50 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestPixelShuffle(OpTest): + def setUp(self): + self.op_type = "pixel_shuffle" + n, c, h, w = 2, 9, 4, 4 + up_factor = 3 + shape = [n, c, h, w] + x = np.random.random(shape).astype("float32") + new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h, + w) + # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w) + npresult = np.reshape(x, new_shape) + # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor] + npresult = np.reshape(npresult, oshape) + + self.inputs = {'X': x} + self.outputs = {'Out': npresult} + self.attrs = {'upscale_factor': up_factor} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main()