Commit 2e0b8713, authored by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_dqn

@@ -235,6 +235,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
...
@@ -13,125 +13,186 @@
 // limitations under the License.

 #include <algorithm>
-#include <memory>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
-#include <utility>
 #include <vector>

-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
 namespace details {

-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
-    }
-  }
-
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-
-  // get vars order
-  int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  //               Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
-  for (auto* op_desc : ops) {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) continue;
-
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
-    }
-  }
-
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb): add broadcast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
-    }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
-
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-
-    if (l_it->second < r_it->second) return true;
-
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
-    }
-
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
-
-    VLOG(10) << "pre_op:" << pre_op->DebugString()
-             << ", op:" << op->DebugString();
-  }
-}
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
+    }
+
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
+    }
+  }
+
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
+    }
+
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
+
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There may be a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
+    }
+    return all_reduce_op_handles;
+  }
+
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
+
+    // NOTE(zcd): For distributed training, it is important to keep the order
+    // of allReduce on each node consistent. Otherwise, hang may occur.
+    // Sort the current_all_reduce_op_handles according to the name of input.
+    sort(current_all_reduce_op_handles.begin(),
+         current_all_reduce_op_handles.end(),
+         [](const AllReduceOpHandle* left,
+            const AllReduceOpHandle* right) -> bool {
+           auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+           auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+           PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+         });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
+
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
+    // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSoredGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10)
+          << "The gradients number of stale program and graph is not equal.";
+    }
+  }
+
+  std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
+    int order = 0;
+    for (auto* op_desc : ops) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};

 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
...
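The rewritten pass above is a Kahn-style ready-set sweep: ops with no unready inputs form the first level, each finished level releases the next, and the allreduce handles found inside one level are sorted by the name of their first input so that every trainer chains them in exactly the same order. A minimal standalone sketch of that pattern, with a toy node type rather than Paddle's real op handles:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Toy DAG node: a name plus indices of downstream nodes.
struct Node {
  std::string name;
  std::vector<int> outs;
  int in_degree = 0;
};

// Kahn's algorithm; ties inside one "ready" level are broken by name so the
// resulting order is deterministic and identical on every process.
std::vector<std::string> StableTopoOrder(std::vector<Node> nodes) {
  for (const auto& n : nodes)
    for (int o : n.outs) nodes[o].in_degree++;

  std::vector<int> ready;
  for (int i = 0; i < static_cast<int>(nodes.size()); ++i)
    if (nodes[i].in_degree == 0) ready.push_back(i);

  std::vector<std::string> order;
  size_t done = 0;
  while (!ready.empty()) {
    std::sort(ready.begin(), ready.end(), [&](int a, int b) {
      return nodes[a].name < nodes[b].name;  // deterministic tie-break
    });
    std::vector<int> next;
    for (int i : ready) {
      order.push_back(nodes[i].name);
      ++done;
      for (int o : nodes[i].outs)
        if (--nodes[o].in_degree == 0) next.push_back(o);
    }
    ready.swap(next);
  }
  assert(done == nodes.size() && "cycle detected");
  return order;
}

int main() {
  // a -> c and b -> c: a and b are both ready first; name order fixes the tie.
  std::vector<Node> g = {{"b", {2}, 0}, {"a", {2}, 0}, {"c", {}, 0}};
  assert((StableTopoOrder(g) == std::vector<std::string>{"a", "b", "c"}));
  return 0;
}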
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
     "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
     "after allreduce, this mode can get better performance in some scenarios.");
...
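For reference, a minimal sketch (an assumed consumer, not code from this commit) of how a gflags boolean defined this way is picked up from another translation unit and used to gate the synchronization:

#include <cuda_runtime.h>
#include "gflags/gflags.h"

DECLARE_bool(sync_nccl_allreduce);  // refers to the DEFINE_bool above

// Hypothetical helper: block the host until the allreduce stream drains
// whenever the flag (now defaulting to true) is set.
void MaybeSyncAfterAllReduce(cudaStream_t nccl_stream) {
  if (FLAGS_sync_nccl_allreduce) {
    cudaStreamSynchronize(nccl_stream);
  }
}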
@@ -163,14 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
         "graph_printer", new details::GraphvizSSAGraphPrinter);
   }

-  // Verify that the graph is correct for multi-device executor.
-  AppendPass("multi_devices_check_pass");
-
-  if (VLOG_IS_ON(2)) {
-    AppendPass("all_reduce_deps_pass");
-  }
-
-  if (SeqOnlyAllReduceOps(strategy_)) {
+  // Experiments show that the program will be faster if we append
+  // all_reduce_deps_pass here.
+  if (!strategy_.enable_parallel_graph_ &&
+      (SeqOnlyAllReduceOps(strategy_) ||
+       strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
     VLOG(10) << "Add all_reduce_deps_pass";
     AppendPass("all_reduce_deps_pass");
   }
@@ -179,6 +176,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     VLOG(10) << "Add modify_op_lock_and_record_event_pass";
     AppendPass("modify_op_lock_and_record_event_pass");
   }
+
+  // Verify that the graph is correct for multi-device executor.
+  AppendPass("multi_devices_check_pass");
 }

 // Convert graph to run on multi-devices.
...
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
       if (out_var_handle) {
         PADDLE_ENFORCE(
             platform::is_same_place(place, out_var_handle->place()),
-            "The place of input(%s) is not consistent with the "
+            "The place of output(%s) is not consistent with the "
             "place of current op(%s).",
             out_var_handle->Name(), Name());
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
...
@@ -617,6 +617,25 @@ void OpDesc::Flush() {
 static std::once_flag init_infer_shape_funcs;

+/**
+ * NOTE(paddle-dev): Very tricky code here. Maybe we should find a
+ * better way to register the compile-time infershape method gently.
+ *
+ * Normally, we can register a class derived from InferShapeBase, so that
+ * we can set the field of `infer_shape_` inside OpInfo when registering op.
+ *
+ * However, there is another way we can set the field of `infer_shape_` inside
+ * OpInfo. Usually, we overload the InferShape method of OperatorWithKernel.
+ * After running the following method InitInferShapeFuncs, `infer_shape_` would
+ * be set to be the InferShape method of OperatorWithKernel. That is to say, we
+ * borrow the run-time InferShape method of OperatorWithKernel to be the
+ * compile-time InferShape method.
+ *
+ * However, at compile time, we may not know the inputs, outputs and attrs of
+ * the run-time OperatorWithKernel. So the following code creates a fake
+ * OperatorWithKernel object. That is why the field info_ of OperatorBase
+ * would be null.
+ */
 static void InitInferShapeFuncs() {
   std::call_once(init_infer_shape_funcs, [] {
     auto &map = OpInfoMap::Instance();
@@ -628,11 +647,16 @@ static void InitInferShapeFuncs() {
       PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
                      op_type);
       auto &op_info = it->second;
-      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
-          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
         continue;
       }
+
+      auto op = dynamic_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
+
+      PADDLE_ENFORCE_NOT_NULL(
+          op, "InferShapeBase is not registered to Operator %s", op_type);
+
       op_info.infer_shape_ = [op](InferShapeContext *ctx) {
         op->InferShape(ctx);
       };
...
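A minimal standalone sketch (toy types, not Paddle's real hierarchy) of why the switch from static_cast to dynamic_cast matters here: for an operator class that does not actually derive from OperatorWithKernel, dynamic_cast yields nullptr, which the new PADDLE_ENFORCE_NOT_NULL can report cleanly, whereas static_cast would silently hand back an invalid pointer:

#include <cassert>

struct OperatorBase { virtual ~OperatorBase() = default; };
struct OperatorWithKernel : OperatorBase { void InferShape() {} };
struct ControlFlowOp : OperatorBase {};  // hypothetical op without a kernel

int main() {
  OperatorBase* op = new ControlFlowOp;
  // dynamic_cast inspects the dynamic type: on mismatch it returns nullptr
  // instead of a bogus pointer, so the failure can be checked and reported.
  assert(dynamic_cast<OperatorWithKernel*>(op) == nullptr);
  delete op;
  return 0;
}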
@@ -19,11 +19,6 @@ limitations under the License. */
 #include <tuple>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -31,6 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 #ifdef WITH_GPERFTOOLS
...
@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+
+cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
 endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void NCCLParallelContext::RecvNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL,
"The endpoint should contain host and port: %s", ep);
std::string host = addr[0];
int port = std::stoi(addr[1]);
int server_fd, new_socket;
struct sockaddr_in address;
int addrlen = sizeof(address);
char buffer[1024] = {0};
  int opt = 1;  // non-zero so SO_REUSEADDR / SO_REUSEPORT are actually enabled
// creating socket fd
  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0)  // socket() returns -1 on failure
    PADDLE_THROW("create server fd failed");
if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt,
sizeof(opt)))
PADDLE_THROW("set socket opt failed");
address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port);
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
PADDLE_THROW("binding failed on ep: %s", ep);
VLOG(3) << "listening on: " << ep;
if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
if ((new_socket =
accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
reinterpret_cast<socklen_t *>(&addrlen))) < 0)
PADDLE_THROW("accept the new socket fd failed");
if (read(new_socket, buffer, 1024) < 0)
PADDLE_THROW("reading the ncclUniqueId from socket failed");
VLOG(3) << "recevived the ncclUniqueId";
memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
VLOG(3) << "closing the socket server: " << ep;
close(server_fd);
}
void NCCLParallelContext::SendNCCLID(const std::string &ep,
ncclUniqueId *nccl_id) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(addr.size(), 2UL,
"The endpoint should contain host and port: %s", ep);
std::string host = addr[0];
int port = std::stoi(addr[1]);
// struct sockaddr_in address;
int sock = 0;
struct sockaddr_in serv_addr;
char buffer[1024] = {0};
memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
PADDLE_THROW("create socket failed");
  memset(&serv_addr, 0, sizeof(serv_addr));  // zero-fill, not the character '0'
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
PADDLE_THROW("invalied address: %s", ep);
while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
VLOG(0) << "worker: " << ep
<< " is not ready, will retry after 3 seconds...";
std::this_thread::sleep_for(std::chrono::seconds(3));
continue;
}
VLOG(3) << "sending the ncclUniqueId to " << ep;
send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
break;
}
}
void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
if (strategy_.local_rank_ == root) {
for (auto ep : strategy_.trainer_endpoints_) {
if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
}
} else {
RecvNCCLID(strategy_.current_endpoint_, nccl_id);
}
}
void NCCLParallelContext::Init() {
ncclUniqueId nccl_id;
ncclComm_t comm;
if (strategy_.local_rank_ == 0) {
// generate the unique ncclid on the root worker
platform::dynload::ncclGetUniqueId(&nccl_id);
BcastNCCLId(&nccl_id, 0);
} else {
BcastNCCLId(&nccl_id, 0);
}
int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
&comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
dev_ctx->set_nccl_comm(comm);
}
#endif
} // namespace imperative
} // namespace paddle
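A hedged usage sketch of the rendezvous above: one process per GPU constructs the context with its ParallelStrategy and calls Init(); rank 0 generates the ncclUniqueId and pushes it to every other endpoint over the TCP handshake, then all ranks join the communicator. The endpoints and rank count here are assumptions chosen for the example:

#include "paddle/fluid/imperative/nccl_context.h"

void InitTwoGpuWorker(int local_rank) {
  paddle::imperative::ParallelStrategy strategy;
  strategy.nranks_ = 2;  // assumed: two single-GPU workers on one host
  strategy.local_rank_ = local_rank;
  strategy.trainer_endpoints_ = {"127.0.0.1:9866", "127.0.0.1:9867"};
  strategy.current_endpoint_ = strategy.trainer_endpoints_[local_rank];

  paddle::imperative::NCCLParallelContext ctx(
      strategy, paddle::platform::CUDAPlace(local_rank));
  ctx.Init();  // broadcast ncclUniqueId, then ncclCommInitRank on each rank
}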
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// network header files
#ifndef _WIN32
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
namespace paddle {
namespace imperative {
struct ParallelStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
};
class ParallelContext {
public:
explicit ParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: strategy_(strategy), place_(place) {}
virtual ~ParallelContext() {}
virtual void Init() = 0;
protected:
ParallelStrategy strategy_;
platform::Place place_;
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class NCCLParallelContext : ParallelContext {
public:
explicit NCCLParallelContext(const ParallelStrategy& strategy,
const platform::Place& place)
: ParallelContext(strategy, place) {}
~NCCLParallelContext() {}
void BcastNCCLId(ncclUniqueId* nccl_id, int root);
void Init() override;
protected:
void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
};
#endif
} // namespace imperative
} // namespace paddle
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,21 +12,41 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#pragma once
+#include "paddle/fluid/imperative/nccl_context.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"

-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+namespace imperative = paddle::imperative;
+namespace platform = paddle::platform;

-namespace paddle {
-namespace framework {
-namespace details {
+imperative::ParallelStrategy GetStrategy(int local_rank) {
+  std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
+  imperative::ParallelStrategy strategy;
+  strategy.trainer_endpoints_ = eps;
+  strategy.current_endpoint_ = eps[local_rank];
+  strategy.nranks_ = 2;
+  strategy.local_rank_ = local_rank;
+  return strategy;
+}

-// TODO(gongwb): overlap allreduce with backward computation.
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
+  auto strategy = GetStrategy(local_rank);
+  platform::CUDAPlace gpu(local_rank);
+  imperative::NCCLParallelContext ctx(strategy, gpu);
+  ctx.BcastNCCLId(nccl_id, 0);
+}

-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+TEST(BcastNCCLId, Run) {
+  ncclUniqueId nccl_id;
+  platform::dynload::ncclGetUniqueId(&nccl_id);
+  std::thread t(BcastNCCLId, 0, &nccl_id);
+
+  ncclUniqueId recv_nccl_id;
+  BcastNCCLId(1, &recv_nccl_id);
+
+  t.join();
+  EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
+                           NCCL_UNIQUE_ID_BYTES));
+}
+#endif
@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       current_vars_map[out->Name()] = out;
     }

-    VLOG(3) << "input var name: " << out->Name()
+    VLOG(3) << "output var name: " << out->Name()
             << " inited: " << out->var_->IsInitialized()
             << " stop_grad: " << out->IsStopGradient();
   }
@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
     framework::Scope scope;
     op->place_ = GetExpectedPlace(expected_place, inputs);
     PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
+    prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
     prepared_op.func(
...
@@ -142,7 +142,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 void AnalysisConfig::EnableMKLDNN() {
 #ifdef PADDLE_WITH_MKLDNN
-  pass_builder()->EnableMKLDNN();
   use_mkldnn_ = true;
 #else
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
@@ -235,16 +234,13 @@ void AnalysisConfig::Update() {
   }

   if (use_mkldnn_) {
+#ifdef PADDLE_WITH_MKLDNN
     if (!enable_ir_optim_) {
       LOG(ERROR)
           << "EnableMKLDNN() only works when IR optimization is enabled.";
+    } else {
+      pass_builder()->EnableMKLDNN();
     }
-#ifdef PADDLE_WITH_MKLDNN
-    pass_builder()->EnableMKLDNN();
-    use_mkldnn_ = true;
-#else
-    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-    use_mkldnn_ = false;
 #endif
   }
@@ -256,9 +252,6 @@ void AnalysisConfig::Update() {
   }
 #ifdef PADDLE_WITH_MKLDNN
     pass_builder()->EnableMkldnnQuantizer();
-#else
-    LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-    use_mkldnn_quantizer_ = false;
 #endif
   }
...
@@ -27,6 +27,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
@@ -266,17 +267,17 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
 }

 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double latency, int epoch = 1) {
-  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
-            << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
-            << " ======";
-  if (epoch > 1) {
-    int samples = batch_size * epoch;
-    LOG(INFO) << "====== sample number: " << samples
-              << ", average latency of each sample: " << latency / samples
-              << "ms ======";
-  }
+                      double batch_latency, int epoch = 1) {
+  PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
+  double sample_latency = batch_latency / batch_size;
+  LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
+            << " ======";
+  LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch
+            << ", repetitions: " << repeat << " ======";
+  LOG(INFO) << "====== batch latency: " << batch_latency
+            << "ms, number of samples: " << batch_size * epoch
+            << ", sample latency: " << sample_latency
+            << "ms, fps: " << 1000.f / sample_latency << " ======";
 }

 static bool IsFileExists(const std::string &path) {
...
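A quick numeric check of the new report, under assumed numbers (batch_size = 50, epoch = 2, a measured batch_latency of 25 ms):

#include <cstdio>

int main() {
  int batch_size = 50, epoch = 2;  // assumed values for illustration only
  double batch_latency = 25.0;     // ms per batch (assumed measurement)
  double sample_latency = batch_latency / batch_size;  // 0.5 ms per sample
  int samples = batch_size * epoch;                    // 100 samples total
  double fps = 1000.0 / sample_latency;                // 2000 samples per second
  std::printf("samples=%d, sample latency=%.1f ms, fps=%.0f\n",
              samples, sample_latency, fps);
  return 0;
}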
@@ -64,10 +64,12 @@ void PaddlePassBuilder::DeletePass(size_t idx) {
   passes_.erase(std::begin(passes_) + idx);
 }

-void GpuPassStrategy::EnableMKLDNN() {
-  LOG(ERROR) << "GPU not support MKLDNN yet";
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
 }

+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",  //
@@ -102,12 +104,12 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }

-void GpuPassStrategy::EnableMkldnnQuantizer() {
-  LOG(ERROR) << "GPU not support MKL-DNN quantization";
+void GpuPassStrategy::EnableMKLDNN() {
+  LOG(ERROR) << "GPU not support MKLDNN yet";
 }

-void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
-  analysis_passes_.push_back(pass);
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }

 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
@@ -130,10 +132,44 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
       "conv_bn_fuse_pass",             //
       "conv_eltwiseadd_bn_fuse_pass",  //
       "is_test_pass",                  //
+      "identity_scale_op_clean_pass",  //
       "runtime_context_cache_pass",    //
   });

   use_gpu_ = false;
 }

-void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+void CpuPassStrategy::EnableMKLDNN() {
+// TODO(Superjomn) Consider the way to mix CPU with GPU.
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_) {
+    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+    for (auto &pass : std::vector<std::string>(
+             {"depthwise_conv_mkldnn_pass",    //
+              "conv_bn_fuse_pass",             // Execute BN passes again to
+              "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
+              "conv_bias_mkldnn_fuse_pass",    //
+              "conv3d_bias_mkldnn_fuse_pass",  //
+              "conv_elementwise_add_mkldnn_fuse_pass",
+              "conv_relu_mkldnn_fuse_pass"})) {
+      passes_.push_back(pass);
+    }
+  }
+  use_mkldnn_ = true;
+#else
+  use_mkldnn_ = false;
+#endif
+}
+
+void CpuPassStrategy::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_quantizer_) {
+    passes_.push_back("cpu_quantize_placement_pass");
+  }
+  use_mkldnn_quantizer_ = true;
+#else
+  use_mkldnn_quantizer_ = false;
+#endif
+}

 }  // namespace paddle
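A hedged usage sketch of how these strategies are reached from client code (the API names match those used elsewhere in this commit; the surrounding setup is assumed):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Assumed client-side setup: on CPU, EnableMKLDNN() routes to
// CpuPassStrategy::EnableMKLDNN() above, which inserts
// mkldnn_placement_pass and the MKL-DNN fuse passes exactly once.
void ConfigureCpuMkldnn(paddle::AnalysisConfig *cfg) {
  cfg->DisableGpu();
  cfg->SwitchIrOptim(true);  // the pass list only runs with IR optimization on
  cfg->EnableMKLDNN();
}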
@@ -109,43 +109,16 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy();

   explicit CpuPassStrategy(const CpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {}
+      : PassStrategy(other.AllPasses()) {
+    use_gpu_ = other.use_gpu_;
+    use_mkldnn_ = other.use_mkldnn_;
+    use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+  }

   virtual ~CpuPassStrategy() = default;

-  void EnableMKLDNN() override {
-  // TODO(Superjomn) Consider the way to mix CPU with GPU.
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_) {
-      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-      for (auto &pass : std::vector<std::string>(
-               {"depthwise_conv_mkldnn_pass",    //
-                "conv_bn_fuse_pass",             // Execute BN passes again to
-                "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-                "conv_bias_mkldnn_fuse_pass",    //
-                "conv3d_bias_mkldnn_fuse_pass",  //
-                "conv_relu_mkldnn_fuse_pass",    //
-                "conv_elementwise_add_mkldnn_fuse_pass"})) {
-        passes_.push_back(pass);
-      }
-    }
-    use_mkldnn_ = true;
-#else
-    use_mkldnn_ = false;
-#endif
-  }
-
-  void EnableMkldnnQuantizer() override {
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_quantizer_) {
-      passes_.push_back("cpu_quantize_placement_pass");
-    }
-    use_mkldnn_quantizer_ = true;
-#else
-    use_mkldnn_quantizer_ = false;
-#endif
-  }
+  void EnableMKLDNN() override;
+  void EnableMkldnnQuantizer() override;

  protected:
   bool use_mkldnn_quantizer_{false};
...
@@ -26,7 +26,11 @@ endfunction()

 function(inference_analysis_api_int8_test target model_dir data_dir filename)
   inference_analysis_test(${target} SRCS ${filename}
       EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
-      ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
+      ARGS --infer_model=${model_dir}/model
+           --infer_data=${data_dir}/data.bin
+           --warmup_batch_size=100
+           --batch_size=50
+           --iterations=2)
 endfunction()

 function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
@@ -146,22 +150,22 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_con
 # int8 image classification tests
 if(WITH_MKLDNN)
-  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
+  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
   if (NOT EXISTS ${INT8_DATA_DIR})
-    inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz")
+    inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz")
   endif()

   #resnet50 int8
   set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
   if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)

   #mobilenet int8
   set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
   if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 endif()
...
@@ -154,7 +154,7 @@ void profile(bool use_mkldnn = false) {
     config.EnableMKLDNN();
   }

-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> inputs;
   LoadInputData(&inputs);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
...
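This type change repeats across the testers below: TestPrediction now collects one vector of output tensors per repetition, and the result checks read the last repetition via outputs.back(). A minimal sketch of the access pattern, with the element type simplified:

#include <cassert>
#include <vector>

int main() {
  // One inner vector per repetition; each holds that run's output tensors.
  std::vector<std::vector<float>> outputs = {{0.1f, 0.9f}, {0.2f, 0.8f}};
  assert(outputs.size() > 0);
  auto output = outputs.back();  // results of the last repetition
  assert(output.size() > 0);     // then index tensors as output[0], ...
  return 0;
}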
@@ -197,7 +197,7 @@ void profile(bool use_mkldnn = false) {
     cfg.SetMKLDNNOp(op_list);
   }

-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -206,9 +206,11 @@ void profile(bool use_mkldnn = false) {
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
...
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

-DEFINE_int32(iterations, 0, "Number of iterations");
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -30,8 +28,13 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->SwitchIrOptim();
   cfg->SwitchSpecifyInputNames(false);
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-
   cfg->EnableMKLDNN();
+  cfg->pass_builder()->SetPasses(
+      {"infer_clean_graph_pass", "mkldnn_placement_pass",
+       "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
+       "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
+       "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
+       "fc_fuse_pass", "is_test_pass"});
 }

 template <typename T>
@@ -40,8 +43,8 @@ class TensorReader {
   TensorReader(std::ifstream &file, size_t beginning_offset,
                std::vector<int> shape, std::string name)
       : file_(file), position(beginning_offset), shape_(shape), name_(name) {
-    numel =
-        std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<T>());
+    numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
+                            std::multiplies<size_t>());
   }

   PaddleTensor NextBatch() {
@@ -71,10 +74,14 @@ class TensorReader {
 };

 std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
-    const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
+    const std::vector<std::vector<PaddleTensor>> &test_data,
+    int num_images = FLAGS_warmup_batch_size) {
   int test_data_batch_size = test_data[0][0].shape[0];
-  CHECK_LE(static_cast<size_t>(num_images),
-           test_data.size() * test_data_batch_size);
+  auto iterations_max = test_data.size();
+  PADDLE_ENFORCE(
+      static_cast<size_t>(num_images) <= iterations_max * test_data_batch_size,
+      "The requested quantization warmup data size " +
+          std::to_string(num_images) + " is bigger than all test data size.");

   PaddleTensor images;
   images.name = "input";
@@ -120,20 +127,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
   std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
   std::vector<int> label_batch_shape{batch_size, 1};
+  auto images_offset_in_file = static_cast<size_t>(file.tellg());
   auto labels_offset_in_file =
-      static_cast<size_t>(file.tellg()) +
-      sizeof(float) * total_images *
-          std::accumulate(image_batch_shape.begin() + 1,
-                          image_batch_shape.end(), 1, std::multiplies<int>());
+      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;

-  TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
+  TensorReader<float> image_reader(file, images_offset_in_file,
+                                   image_batch_shape, "input");
   TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                      label_batch_shape, "label");

-  auto iterations = total_images / batch_size;
-  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
-    iterations = FLAGS_iterations;
-  for (auto i = 0; i < iterations; i++) {
+  auto iterations_max = total_images / batch_size;
+  for (auto i = 0; i < iterations_max; i++) {
     auto images = image_reader.NextBatch();
     auto labels = label_reader.NextBatch();
     inputs->emplace_back(
@@ -148,20 +152,21 @@ TEST(Analyzer_int8_resnet50, quantization) {
   AnalysisConfig q_cfg;
   SetConfig(&q_cfg);

+  // read data from file and prepare batches with test data
   std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all, 100);
+  SetInput(&input_slots_all);

+  // prepare warmup batch from input data read earlier
+  // warmup batch size can be different than batch size
   std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
-      GetWarmupData(input_slots_all, 100);
+      GetWarmupData(input_slots_all);

+  // configure quantizer
   q_cfg.EnableMkldnnQuantizer();
   q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);

-  CompareQuantizedAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-      reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
-      input_slots_all);
+  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all);
 }

 }  // namespace analysis
...
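The data file layout assumed by SetInput above: a small header (consumed before tellg()), then total_images float32 images of shape 3x224x224, then the int64 labels. A worked check of the new offset arithmetic, with an assumed total_images of 100:

#include <cstdio>

int main() {
  const size_t total_images = 100;           // assumed for the example
  const size_t image_numel = 3 * 224 * 224;  // 150528 floats per image
  const size_t images_offset = 0;            // stand-in for file.tellg()
  const size_t labels_offset =
      images_offset + sizeof(float) * total_images * image_numel;
  std::printf("labels start at byte %zu\n", labels_offset);  // 60211200
  return 0;
}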
@@ -124,7 +124,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_LAC, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -137,11 +137,13 @@ TEST(Analyzer_LAC, profile) {
         24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
         44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
         15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
     PADDLE_ENFORCE_GE(size, batch1_size);
-    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
     }
...
@@ -96,7 +96,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   if (use_mkldnn) {
     cfg.EnableMKLDNN();
@@ -108,8 +108,9 @@ void profile(bool use_mkldnn = false) {
                  input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
-    for (auto &output : outputs) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL);
+    for (auto &output : outputs.back()) {
       size_t size = GetSize(output);
       PADDLE_ENFORCE_GT(size, 0);
       float *result = static_cast<float *>(output.data.data());
...
@@ -106,7 +106,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool memory_load = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg, memory_load);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -117,10 +117,12 @@ void profile(bool memory_load = false) {
     // the first inference result
     const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                            48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *result = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < std::min(11UL, size); i++) {
       EXPECT_EQ(result[i], chinese_ner_result_data[i]);
     }
...
@@ -127,7 +127,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Pyramid_DNN, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -135,10 +135,12 @@ TEST(Analyzer_Pyramid_DNN, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);

   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
       EXPECT_GT(result[i], 0);
...
@@ -40,7 +40,7 @@ void profile(bool use_mkldnn = false) {
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
...
@@ -229,7 +229,7 @@ TEST(Analyzer_rnn1, profile) {
   SetConfig(&cfg);
   cfg.DisableGpu();
   cfg.SwitchIrDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -280,7 +280,7 @@ TEST(Analyzer_rnn1, compare_determine) {
 TEST(Analyzer_rnn1, multi_thread) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
...
@@ -126,7 +126,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn2, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -136,9 +136,11 @@ TEST(Analyzer_rnn2, profile) {
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
...
...@@ -110,7 +110,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -110,7 +110,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST(Analyzer_seq_conv1, profile) { TEST(Analyzer_seq_conv1, profile) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -119,10 +119,12 @@ TEST(Analyzer_seq_conv1, profile) { ...@@ -119,10 +119,12 @@ TEST(Analyzer_seq_conv1, profile) {
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
PADDLE_ENFORCE_EQ(outputs.size(), 1UL); PADDLE_ENFORCE_GT(outputs.size(), 0);
size_t size = GetSize(outputs[0]); auto output = outputs.back();
PADDLE_ENFORCE_EQ(output.size(), 1UL);
size_t size = GetSize(output[0]);
PADDLE_ENFORCE_GT(size, 0); PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data()); float *result = static_cast<float *>(output[0].data.data());
// output is probability, which is in (0, 1). // output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) { for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], 0); EXPECT_GT(result[i], 0);
......
...@@ -156,7 +156,7 @@ void profile(bool use_mkldnn = false) { ...@@ -156,7 +156,7 @@ void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg, use_mkldnn); SetConfig(&cfg, use_mkldnn);
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
......
...@@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.SwitchIrDebug(); cfg.SwitchIrDebug();
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) {
if (FLAGS_num_threads == 1) { if (FLAGS_num_threads == 1) {
// Get output // Get output
LOG(INFO) << "get outputs " << outputs.size(); PADDLE_ENFORCE_GT(outputs.size(), 0);
for (auto &output : outputs) { LOG(INFO) << "get outputs " << outputs.back().size();
for (auto &output : outputs.back()) {
LOG(INFO) << "output.shape: " << to_string(output.shape); LOG(INFO) << "output.shape: " << to_string(output.shape);
// no lod ? // no lod ?
CHECK_EQ(output.lod.size(), 0UL); CHECK_EQ(output.lod.size(), 0UL);
......
...@@ -186,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -186,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
if (use_mkldnn) { if (use_mkldnn) {
cfg.EnableMKLDNN(); cfg.EnableMKLDNN();
} }
......
...@@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) { ...@@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) {
cfg.EnableMKLDNN(); cfg.EnableMKLDNN();
} }
// cfg.pass_builder()->TurnOnDebug(); // cfg.pass_builder()->TurnOnDebug();
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) { ...@@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) {
auto refer = ProcessALine(line); auto refer = ProcessALine(line);
file.close(); file.close();
auto &output = outputs.front(); PADDLE_ENFORCE_GT(outputs.size(), 0);
auto &output = outputs.back().front();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
CHECK_EQ(numel, refer.data.size()); CHECK_EQ(numel, refer.data.size());
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
......
...@@ -41,7 +41,10 @@ DEFINE_string(model_name, "", "model name"); ...@@ -41,7 +41,10 @@ DEFINE_string(model_name, "", "model name");
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file"); DEFINE_string(infer_data, "", "data file");
DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_string(refer_result, "", "reference result for comparison");
DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(batch_size, 1, "batch size");
DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
// setting iterations to 0 means processing the whole dataset
DEFINE_int32(iterations, 0, "number of batches to process");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
...@@ -239,7 +242,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -239,7 +242,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
} }
input.shape = shape; input.shape = shape;
input.dtype = PaddleDType::FLOAT32; input.dtype = PaddleDType::FLOAT32;
size_t len = std::accumulate(shape.begin(), shape.end(), 1, size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
[](int a, int b) { return a * b; }); [](int a, int b) { return a * b; });
input.data.Resize(len * sizeof(float)); input.data.Resize(len * sizeof(float));
input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}}); input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
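Seeding std::accumulate with size_t{1} instead of the int literal 1 makes the
accumulator a size_t, guarding the element-count product against 32-bit
overflow for large shapes. Note, however, that the lambda keeps the signature
(int a, int b), which converts the accumulator back to int on every step; for
the widening to take full effect the accumulator parameter must be widened
too, as in this sketch:

    // multiply the shape dims in 64 bits end to end
    size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
                                 [](size_t a, int b) { return a * b; });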
...@@ -286,17 +289,18 @@ void ConvertPaddleTensorToZeroCopyTensor( ...@@ -286,17 +289,18 @@ void ConvertPaddleTensorToZeroCopyTensor(
void PredictionWarmUp(PaddlePredictor *predictor, void PredictionWarmUp(PaddlePredictor *predictor,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<std::vector<PaddleTensor>> *outputs,
int tid) { int num_threads, int tid) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
LOG(INFO) << "Running thread " << tid << ", warm up run..."; LOG(INFO) << "Running thread " << tid << ", warm up run...";
if (FLAGS_zero_copy) { if (FLAGS_zero_copy) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
} }
outputs->resize(1);
Timer warmup_timer; Timer warmup_timer;
warmup_timer.tic(); warmup_timer.tic();
if (!FLAGS_zero_copy) { if (!FLAGS_zero_copy) {
predictor->Run(inputs[0], outputs, batch_size); predictor->Run(inputs[0], &(*outputs)[0], batch_size);
} else { } else {
predictor->ZeroCopyRun(); predictor->ZeroCopyRun();
} }
...@@ -308,11 +312,16 @@ void PredictionWarmUp(PaddlePredictor *predictor, ...@@ -308,11 +312,16 @@ void PredictionWarmUp(PaddlePredictor *predictor,
void PredictionRun(PaddlePredictor *predictor, void PredictionRun(PaddlePredictor *predictor,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<std::vector<PaddleTensor>> *outputs,
int tid) { int num_threads, int tid) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
iterations =
FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations);
LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads
<< ", run " << num_times << " times...";
Timer run_timer; Timer run_timer;
double elapsed_time = 0; double elapsed_time = 0;
#ifdef WITH_GPERFTOOLS #ifdef WITH_GPERFTOOLS
...@@ -320,14 +329,14 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -320,14 +329,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif #endif
if (!FLAGS_zero_copy) { if (!FLAGS_zero_copy) {
run_timer.tic(); run_timer.tic();
for (size_t i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], outputs, batch_size); predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
} }
} }
elapsed_time = run_timer.toc(); elapsed_time = run_timer.toc();
} else { } else {
for (size_t i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic(); run_timer.tic();
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
...@@ -340,13 +349,14 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -340,13 +349,14 @@ void PredictionRun(PaddlePredictor *predictor,
ProfilerStop(); ProfilerStop();
#endif #endif
PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times, auto batch_latency = elapsed_time / (iterations * num_times);
inputs.size()); PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
iterations);
if (FLAGS_record_benchmark) { if (FLAGS_record_benchmark) {
Benchmark benchmark; Benchmark benchmark;
benchmark.SetName(FLAGS_model_name); benchmark.SetName(FLAGS_model_name);
benchmark.SetBatchSize(batch_size); benchmark.SetBatchSize(FLAGS_batch_size);
benchmark.SetLatency(elapsed_time / num_times); benchmark.SetLatency(batch_latency);
benchmark.PersistToFile("benchmark_record.txt"); benchmark.PersistToFile("benchmark_record.txt");
} }
} }
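PredictionRun now honors the new iterations flag (0 means the whole dataset),
stores outputs per batch, and normalizes the reported latency per batch rather
than per pass over the whole set. A condensed sketch of the control flow; the
static_cast is an editorial addition that also silences the signed/unsigned
comparison the diff leaves in place:

    size_t iterations = inputs.size();            // FLAGS_iterations == 0
    if (FLAGS_iterations > 0 &&
        static_cast<size_t>(FLAGS_iterations) < inputs.size())
      iterations = FLAGS_iterations;              // cap at the flag value
    outputs->resize(iterations);                  // one output slot per batch
    // run each of the `iterations` batches num_times times, timing the loop
    double batch_latency = elapsed_time / (iterations * num_times);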
...@@ -354,16 +364,17 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -354,16 +364,17 @@ void PredictionRun(PaddlePredictor *predictor,
void TestOneThreadPrediction( void TestOneThreadPrediction(
const PaddlePredictor::Config *config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) { std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0); PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
PredictionRun(predictor.get(), inputs, outputs, 1, 0); 0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
const PaddlePredictor::Config *config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<std::vector<PaddleTensor>> *outputs, int num_threads,
bool use_analysis = true) { bool use_analysis = true) {
std::vector<std::thread> threads; std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors; std::vector<std::unique_ptr<PaddlePredictor>> predictors;
...@@ -376,7 +387,7 @@ void TestMultiThreadPrediction( ...@@ -376,7 +387,7 @@ void TestMultiThreadPrediction(
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
// Each thread should have local inputs and outputs. // Each thread should have local inputs and outputs.
// The inputs of each thread are all the same. // The inputs of each thread are all the same.
std::vector<PaddleTensor> outputs_tid; std::vector<std::vector<PaddleTensor>> outputs_tid;
auto &predictor = predictors[tid]; auto &predictor = predictors[tid];
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (use_analysis) { if (use_analysis) {
...@@ -384,8 +395,8 @@ void TestMultiThreadPrediction( ...@@ -384,8 +395,8 @@ void TestMultiThreadPrediction(
->SetMkldnnThreadID(static_cast<int>(tid) + 1); ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
} }
#endif #endif
PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid); PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid);
PredictionRun(predictor.get(), inputs, outputs, num_threads, tid); PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
}); });
} }
for (int i = 0; i < num_threads; ++i) { for (int i = 0; i < num_threads; ++i) {
...@@ -395,8 +406,8 @@ void TestMultiThreadPrediction( ...@@ -395,8 +406,8 @@ void TestMultiThreadPrediction(
void TestPrediction(const PaddlePredictor::Config *config, void TestPrediction(const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<std::vector<PaddleTensor>> *outputs,
bool use_analysis = FLAGS_use_analysis) { int num_threads, bool use_analysis = FLAGS_use_analysis) {
PrintConfig(config, use_analysis); PrintConfig(config, use_analysis);
if (num_threads == 1) { if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs, use_analysis); TestOneThreadPrediction(config, inputs, outputs, use_analysis);
...@@ -406,30 +417,41 @@ void TestPrediction(const PaddlePredictor::Config *config, ...@@ -406,30 +417,41 @@ void TestPrediction(const PaddlePredictor::Config *config,
} }
} }
void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1, void CompareTopAccuracy(
const std::vector<PaddleTensor> &output_slots2) { const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
// first output: avg_cost const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
if (output_slots1.size() == 0 || output_slots2.size() == 0) if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0)
throw std::invalid_argument( throw std::invalid_argument(
"CompareTopAccuracy: output_slots vector is empty."); "CompareTopAccuracy: output_slots vector is empty.");
PADDLE_ENFORCE(output_slots1.size() >= 2UL);
PADDLE_ENFORCE(output_slots2.size() >= 2UL);
// second output: acc_top1 float total_accs1_quant{0};
if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0) float total_accs1_ref{0};
throw std::invalid_argument( for (size_t i = 0; i < output_slots_quant.size(); ++i) {
"CompareTopAccuracy: top1 accuracy output has nonempty LoD."); PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL);
if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 || PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL);
output_slots2[1].dtype != paddle::PaddleDType::FLOAT32) // second output: acc_top1
throw std::invalid_argument( if (output_slots_quant[i][1].lod.size() > 0 ||
"CompareTopAccuracy: top1 accuracy output is of a wrong type."); output_slots_ref[i][1].lod.size() > 0)
float *top1_quantized = static_cast<float *>(output_slots1[1].data.data()); throw std::invalid_argument(
float *top1_reference = static_cast<float *>(output_slots2[1].data.data()); "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized; if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 ||
LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference; output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output is of a wrong type.");
total_accs1_quant +=
*static_cast<float *>(output_slots_quant[i][1].data.data());
total_accs1_ref +=
*static_cast<float *>(output_slots_ref[i][1].data.data());
}
float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();
LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6)
<< std::setprecision(4) << avg_acc1_quant;
LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6)
<< std::setprecision(4) << avg_acc1_ref;
LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy; LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
CHECK_LE(std::abs(*top1_quantized - *top1_reference), CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy);
FLAGS_quantized_accuracy);
} }
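CompareTopAccuracy previously compared a single acc_top1 value from each run;
with per-batch outputs it averages over all N processed batches before
applying the threshold:

    \bar{a}_{\mathrm{INT8}} = \frac{1}{N}\sum_{i=1}^{N} a^{\mathrm{INT8}}_{i},
    \qquad
    \bar{a}_{\mathrm{FP32}} = \frac{1}{N}\sum_{i=1}^{N} a^{\mathrm{FP32}}_{i},
    \qquad
    \lvert\bar{a}_{\mathrm{INT8}} - \bar{a}_{\mathrm{FP32}}\rvert
      \le \texttt{FLAGS\_quantized\_accuracy}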
void CompareDeterministic( void CompareDeterministic(
...@@ -455,20 +477,35 @@ void CompareNativeAndAnalysis( ...@@ -455,20 +477,35 @@ void CompareNativeAndAnalysis(
const PaddlePredictor::Config *config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true); PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<std::vector<PaddleTensor>> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
CompareResult(analysis_outputs, native_outputs); PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty.");
PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty.");
CompareResult(analysis_outputs.back(), native_outputs.back());
} }
void CompareQuantizedAndAnalysis( void CompareQuantizedAndAnalysis(
const PaddlePredictor::Config *config, const AnalysisConfig *config, const AnalysisConfig *qconfig,
const PaddlePredictor::Config *qconfig,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true); PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size,
std::vector<PaddleTensor> analysis_outputs, quantized_outputs; "Input data has to be packed batch by batch.");
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true); << ", warmup batch size " << FLAGS_warmup_batch_size << ".";
LOG(INFO) << "--- FP32 prediction start ---";
auto *cfg = reinterpret_cast<const PaddlePredictor::Config *>(config);
PrintConfig(cfg, true);
std::vector<std::vector<PaddleTensor>> analysis_outputs;
TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true);
LOG(INFO) << "--- INT8 prediction start ---";
auto *qcfg = reinterpret_cast<const PaddlePredictor::Config *>(qconfig);
PrintConfig(qcfg, true);
std::vector<std::vector<PaddleTensor>> quantized_outputs;
TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true);
LOG(INFO) << "--- comparing outputs --- ";
CompareTopAccuracy(quantized_outputs, analysis_outputs); CompareTopAccuracy(quantized_outputs, analysis_outputs);
} }
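A hypothetical call site for the reworked CompareQuantizedAndAnalysis; the
configs and the quantizer call are illustrative assumptions, not taken from
this diff:

    AnalysisConfig fp32_cfg, int8_cfg;
    SetConfig(&fp32_cfg);
    SetConfig(&int8_cfg);
    int8_cfg.EnableMkldnnQuantizer();  // assumed INT8 path for the qconfig
    std::vector<std::vector<PaddleTensor>> input_slots_all;
    SetInput(&input_slots_all);        // batches packed at FLAGS_batch_size
    CompareQuantizedAndAnalysis(&fp32_cfg, &int8_cfg, input_slots_all);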
...@@ -578,9 +615,9 @@ static bool CompareTensorData(const framework::LoDTensor &a, ...@@ -578,9 +615,9 @@ static bool CompareTensorData(const framework::LoDTensor &a,
const framework::LoDTensor &b) { const framework::LoDTensor &b) {
auto a_shape = framework::vectorize(a.dims()); auto a_shape = framework::vectorize(a.dims());
auto b_shape = framework::vectorize(b.dims()); auto b_shape = framework::vectorize(b.dims());
size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1},
[](int a, int b) { return a * b; }); [](int a, int b) { return a * b; });
size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1},
[](int a, int b) { return a * b; }); [](int a, int b) { return a * b; });
if (a_size != b_size) { if (a_size != b_size) {
LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
......
...@@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { ...@@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
} }
std::vector<PaddleTensor> outputs; std::vector<std::vector<PaddleTensor>> outputs;
if (use_analysis || use_tensorrt) { if (use_analysis || use_tensorrt) {
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
......
...@@ -8,9 +8,6 @@ conv_shift ...@@ -8,9 +8,6 @@ conv_shift
cos cos
cos_sim cos_sim
dequantize dequantize
elementwise_div
elementwise_max
elementwise_min
elu elu
fc fc
flatten flatten
...@@ -28,8 +25,6 @@ gelu ...@@ -28,8 +25,6 @@ gelu
gru gru
hard_shrink hard_shrink
hierarchical_sigmoid hierarchical_sigmoid
hinge_loss
huber_loss
leaky_relu leaky_relu
log log
logsigmoid logsigmoid
...@@ -57,7 +52,6 @@ requantize ...@@ -57,7 +52,6 @@ requantize
reshape reshape
rnn_memory_helper rnn_memory_helper
round round
row_conv
sequence_softmax sequence_softmax
sin sin
softplus softplus
......
...@@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
virtual void Apply() = 0; virtual void Apply() = 0;
}; };
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference,
"Input");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
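DECLARE_NO_NEED_BUFFER_VARS_INFERENCE registers a rule saying the op reads
only the metadata (shape, LoD) of the named variable, never its data buffer,
so the framework's memory optimizations may free or reuse that buffer early.
Every *_batch_size_like op needs only Input's shape, hence one shared rule.
A sketch of attaching the rule at registration (the op name is hypothetical;
the pattern mirrors the registrations later in this diff):

    DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference,
                                          "Input");
    REGISTER_OPERATOR(some_batch_size_like,  // hypothetical op name
                      ops::SomeBatchSizeLikeOp,
                      ops::SomeBatchSizeLikeOpMaker,
                      paddle::framework::EmptyGradOpMaker,
                      ops::BatchSizeLikeNoNeedBufferVarsInference);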
...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
...@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp {
framework::Executor exec(dev_place); framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"), const auto &ins = Inputs("Input");
Outputs(framework::GradVarName("Input"))); const auto &d_ins = Outputs(framework::GradVarName("Input"));
const auto &conds = Inputs("Cond");
const auto &d_conds = Outputs(framework::GradVarName("Cond"));
std::vector<std::string> ins_conds_grads;
ins_conds_grads.reserve(ins.size() + conds.size());
for (auto &in : ins) {
ins_conds_grads.emplace_back(framework::GradVarName(in));
}
for (auto &cond : conds) {
ins_conds_grads.emplace_back(framework::GradVarName(cond));
}
exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
ins_conds_grads);
AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
ins.size(), d_ins);
AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"), AssignLocalGradientToGlobal(dev_place, cur_scope,
Outputs(framework::GradVarName("Cond"))); ins_conds_grads.data() + ins.size(),
conds.size(), d_conds);
} }
} }
private: private:
void AssignLocalGradientToGlobal( void AssignLocalGradientToGlobal(
const platform::Place &place, const framework::Scope &cur_scope, const platform::Place &place, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names, const std::string *p_grad_names, size_t p_grad_names_num,
const std::vector<std::string> &pg_names) const { const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_grad_names_num; ++i) {
auto out_grad_name = pg_names[i]; auto out_grad_name = pg_names[i];
auto in_grad_name = framework::GradVarName(p_names[i]); const auto &in_grad_name = p_grad_names[i];
auto *in_var = cur_scope.FindVar(in_grad_name); auto *in_var = cur_scope.FindVar(in_grad_name);
if (in_var == nullptr) { if (in_var == nullptr) {
continue; continue;
......
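The rewritten ConditionalBlockGradOp collects the @GRAD-suffixed names of both
Input and Cond before running the sub-block, and hands that list to
Executor::Run; assuming the trailing argument is the set of variables exempt
from reference-count-based garbage collection, this keeps the gradient
variables alive in cur_scope until AssignLocalGradientToGlobal copies them
out. A sketch of the naming convention involved:

    // framework::GradVarName appends the gradient suffix:
    //   GradVarName("x") == "x@GRAD"
    std::vector<std::string> ins_conds_grads;
    for (auto &in : Inputs("Input"))
      ins_conds_grads.emplace_back(framework::GradVarName(in));
    for (auto &cond : Inputs("Cond"))
      ins_conds_grads.emplace_back(framework::GradVarName(cond));
    // entries [0, ins.size()) pair with Input grads, the rest with Cond grads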
...@@ -13,10 +13,47 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,47 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseDivOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Div"; }
std::string GetEquation() const override { return "Out = X / Y"; }
};
class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("elementwise_div_grad");
op->SetInput("Y", Input("Y"));
op->SetInput("Out", Output("Out"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType,
ops::ElementwiseDivGradOpDescMaker);
REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_div, elementwise_div,
......
...@@ -47,7 +47,7 @@ struct DivGradDX { ...@@ -47,7 +47,7 @@ struct DivGradDX {
template <typename T> template <typename T>
struct DivGradDY { struct DivGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return -dout * x / (y * y); return -dout * out / y;
} }
}; };
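The rewritten DivGradDY is the same derivative expressed through the forward
output; since out = x / y,

    \frac{\partial L}{\partial y}
      = \mathrm{dout}\cdot\frac{\partial}{\partial y}\Big(\frac{x}{y}\Big)
      = -\,\mathrm{dout}\cdot\frac{x}{y^{2}}
      = -\,\mathrm{dout}\cdot\frac{\mathrm{out}}{y}

so the backward pass no longer reads X at all. That is what lets the new
ElementwiseDivGradOpDescMaker above feed Out instead of X, and the kernel
below pass a placeholder for x.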
...@@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> { ...@@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
ElemwiseGradKernel<T>::Compute(ctx); ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out"); auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto* x = dout; // placeholder for x: DivGradDX/DivGradDY never read it
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>( ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>()); ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
} }
......
...@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Max"; }
std::string GetEquation() const override { return "Out = max(X, Y)"; }
};
class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("elementwise_max_grad");
op->SetInput("X", Input("X"));
op->SetInput("Y", Input("Y"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)");
REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType,
ops::ElementwiseMaxGradOpDescMaker);
REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_max, elementwise_max,
ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> { ...@@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
auto* x = ctx.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* out = dout; // placeholder for out: MaxGradDx/MaxGradDy never read it
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>( ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>()); ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>());
......
...@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_min_op.h" #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseMinOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Min"; }
std::string GetEquation() const override { return "Out = min(X, Y)"; }
};
class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("elementwise_min_grad");
op->SetInput("X", Input("X"));
op->SetInput("Y", Input("Y"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)");
REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType,
ops::ElementwiseMinGradOpDescMaker);
REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_min, elementwise_min,
ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> { ...@@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
auto* x = ctx.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* out = dout; // placeholder for out: MinGradDx/MinGradDy never read it
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>( ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>()); ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>());
......
...@@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); auto out_grad_name = framework::GradVarName("Out");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
"Input(Out@GRAD) should not be null"); "Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim(out_grad_name);
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
...@@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
ctx->ShareDim("X", /*->*/ x_grad_name); ctx->ShareDim(out_grad_name, /*->*/ x_grad_name);
ctx->ShareLoD("X", /*->*/ x_grad_name); ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name);
} }
if (ctx->HasOutput(y_grad_name)) { if (ctx->HasOutput(y_grad_name)) {
ctx->ShareDim("Y", /*->*/ y_grad_name); ctx->ShareDim("Y", /*->*/ y_grad_name);
......
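ElementwiseOpGrad::InferShape can drop the X requirement because, in these
elementwise ops, Out takes X's shape (Y is the operand that may be broadcast),
so Out@GRAD carries exactly the dims and LoD that X@GRAD needs. Distilled:

    // Out@GRAD has X's shape, so X itself is not needed here:
    auto og = framework::GradVarName("Out");      // "Out@GRAD"
    if (ctx->HasOutput(framework::GradVarName("X"))) {
      ctx->ShareDim(og, framework::GradVarName("X"));
      ctx->ShareLoD(og, framework::GradVarName("X"));
    }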
...@@ -46,6 +46,7 @@ obtained from the `input` tensor. ...@@ -46,6 +46,7 @@ obtained from the `input` tensor.
)DOC"); )DOC");
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -53,7 +54,8 @@ namespace ops = paddle::operators; ...@@ -53,7 +54,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant_batch_size_like, REGISTER_OPERATOR(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOp, ops::FillConstantBatchSizeLikeOp,
paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker,
ops::FillConstantBatchSizeLikeOpMaker); ops::FillConstantBatchSizeLikeOpMaker,
ops::BatchSizeLikeNoNeedBufferVarsInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
......
...@@ -36,6 +36,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,6 +36,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("X", "The input of fill-zeros-like op."); AddInput("X", "The input of fill-zeros-like op.");
AddOutput("Out", "The variable will be filled up with zeros."); AddOutput("Out", "The variable will be filled up with zeros.");
ExtraMake();
AddComment(R"DOC( AddComment(R"DOC(
FillZerosLike Operator. FillZerosLike Operator.
...@@ -44,13 +45,49 @@ The output will have the same size as the input. ...@@ -44,13 +45,49 @@ The output will have the same size as the input.
)DOC"); )DOC");
} }
protected:
virtual void ExtraMake() {}
};
class FillZerosLikeOp2 : public FillZerosLikeOp {
public:
using FillZerosLikeOp::FillZerosLikeOp;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.GetPlace());
}
}; };
class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker {
protected:
void ExtraMake() override {
this->AddAttr<int>("dtype",
"(int, default 5(FP32)) "
"Output data type.")
.SetDefault(framework::proto::VarType::FP32);
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference,
"X");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
ops::FillZerosLikeOpMaker); ops::FillZerosLikeOpMaker);
REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2,
ops::FillZerosLikeOp2Maker,
ops::FillZerosLikeOp2NoNeedBufferVarsInference,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_zeros_like, fill_zeros_like,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>, ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
...@@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL( ...@@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>, ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>, ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>); ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
REGISTER_OP_CPU_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
...@@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>, paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>); ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
REGISTER_OP_CUDA_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
...@@ -65,17 +65,13 @@ by input arguments. ...@@ -65,17 +65,13 @@ by input arguments.
} }
}; };
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(
GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR( REGISTER_OPERATOR(gaussian_random_batch_size_like,
gaussian_random_batch_size_like, paddle::operators::GaussianRandomBatchSizeLikeOp,
paddle::operators::GaussianRandomBatchSizeLikeOp, paddle::operators::GaussianRandomBatchSizeLikeOpMaker,
paddle::operators::GaussianRandomBatchSizeLikeOpMaker, paddle::framework::EmptyGradOpMaker,
paddle::framework::EmptyGradOpMaker, paddle::operators::BatchSizeLikeNoNeedBufferVarsInference);
paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference);
// Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/hinge_loss_op.h" #include "paddle/fluid/operators/hinge_loss_op.h"
#include <memory>
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel { ...@@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
} }
}; };
class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("hinge_loss_grad");
op->SetInput("Logits", Input("Logits"));
op->SetInput("Labels", Input("Labels"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>, REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>); ops::HingeLossGradOpDescMaker);
REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
hinge_loss, hinge_loss,
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/huber_loss_op.h" #include "paddle/fluid/operators/huber_loss_op.h"
#include <memory>
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -90,38 +93,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel { ...@@ -90,38 +93,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Residual"),
"Input(Residual) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null."); "Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto residual_dims = ctx->GetInputDim("Residual"); auto residual_dims = ctx->GetInputDim("Residual");
auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(residual_dims, x_dims);
PADDLE_ENFORCE_EQ(out_grad_dims, x_dims);
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims); ctx->SetOutputDim(x_grad_name, residual_dims);
} }
if (ctx->HasOutput(y_grad_name)) { if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims); ctx->SetOutputDim(y_grad_name, residual_dims);
} }
} }
}; };
class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("huber_loss_grad");
op->SetInput("Residual", Output("Residual"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>, REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>); ops::HuberLossGradOpDescMaker);
REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>, huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
......
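The Huber-loss backward depends only on the saved Residual and Out@GRAD,
which is why the new grad-op desc maker wires exactly those two inputs and
why the X/Y checks could be dropped from HuberLossGradOp::InferShape (the
grad shapes are taken from Residual instead). Assuming the usual Huber
definition with residual r = y - x and threshold \delta:

    g(r) = \begin{cases} r, & |r| \le \delta \\
                         \delta\,\mathrm{sign}(r), & |r| > \delta \end{cases}
    \qquad
    \frac{\partial L}{\partial x} = -\,\mathrm{dout}\cdot g(r), \qquad
    \frac{\partial L}{\partial y} = \mathrm{dout}\cdot g(r)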
...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { ...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, platform::CPUPlace()); framework::proto::VarType::FP32, ctx.GetPlace());
return kt; return kt;
} }
}; };
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
#include <memory>
namespace paddle {
namespace operators {
class PixelShuffleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PixelShuffleOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PixelShuffleOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(input_dims.size() == 4, "Input(X) should be a 4-D tensor with layout NCHW.");
auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0,
"Upscale_factor should devide the number of channel");
auto output_dims = input_dims;
output_dims[0] = input_dims[0];
output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
output_dims[2] = input_dims[2] * upscale_factor;
output_dims[3] = input_dims[3] * upscale_factor;
ctx->SetOutputDim("Out", output_dims);
}
};
class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor, default Tensor<float>), "
"the input feature data of PixelShuffleOp, the layout is [N C H W].");
AddOutput(
"Out",
"(Tensor, default Tensor<float>), the output of "
"PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
AddAttr<int>("upscale_factor",
"the factor to increase spatial resolution by.")
.SetDefault(1)
.AddCustomChecker([](const int& upscale_factor) {
PADDLE_ENFORCE_GE(upscale_factor, 1,
"upscale_factor should be larger than 0.");
});
AddComment(R"DOC(
Pixel Shuffle operator
This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
to a tensor of shape :math:`(*, C, H \times r, W \times r)`.
This is useful for implementing efficient sub-pixel convolution
with a stride of :math:`1/r`.
Please refer to the paper:
`Real-Time Single Image and Video Super-Resolution Using an Efficient
Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et al. (2016) for more details.
)DOC");
}
};
class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("pixel_shuffle_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
class PixelShuffleGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad) should not be null");
auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE(do_dims.size() == 4, "Input(Out@Grad) should be a 4-D tensor with layout NCHW.");
auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
auto dx_dims = do_dims;
dx_dims[0] = do_dims[0];
dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
dx_dims[2] = do_dims[2] / upscale_factor;
dx_dims[3] = do_dims[3] / upscale_factor;
ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
ops::PixelShuffleGradMaker);
REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
REGISTER_OP_CPU_KERNEL(
pixel_shuffle,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pixel_shuffle_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
pixel_shuffle_grad,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class PixelShuffleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int factor = ctx.Attr<int>("upscale_factor");
auto in_dims = in->dims();
auto o_dims = out->dims();
framework::Tensor t;
t.ShareDataWith(*in);
t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
std::vector<int> axis = {0, 1, 4, 2, 5, 3};
framework::Tensor o;
o.ShareDataWith(*out);
o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
math::Transpose<DeviceContext, T, 6> trans;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
trans(dev_ctx, t, &o, axis);
out->Resize(o_dims);
}
};
template <typename DeviceContext, typename T>
class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
int factor = ctx.Attr<int>("upscale_factor");
auto do_dims = dout->dims();
auto dx_dims = dx->dims();
framework::Tensor t;
t.ShareDataWith(*dout);
t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
std::vector<int> axis = {0, 1, 3, 5, 2, 4};
framework::Tensor o;
o.ShareDataWith(*dx);
o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
math::Transpose<DeviceContext, T, 6> trans;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
trans(dev_ctx, t, &o, axis);
dx->Resize(dx_dims);
}
};
} // namespace operators
} // namespace paddle
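For reference, the forward kernel above is equivalent to the following NumPy computation (a minimal sketch; pixel_shuffle_ref is an illustrative name, not part of the codebase):

import numpy as np

def pixel_shuffle_ref(x, r):
    # x: [N, C, H, W] with C divisible by r * r
    n, c, h, w = x.shape
    # reshape to [N, C/r^2, r, r, H, W], matching t.Resize in the kernel
    t = x.reshape(n, c // (r * r), r, r, h, w)
    # permute with axis = {0, 1, 4, 2, 5, 3}, matching math::Transpose
    t = t.transpose(0, 1, 4, 2, 5, 3)
    # collapse to the output shape [N, C/r^2, H*r, W*r]
    return t.reshape(n, c // (r * r), h * r, w * r)

The backward kernel applies the inverse permutation (axis = {0, 1, 3, 5, 2, 4}) to the incoming gradient, so the mapping is lossless in both directions.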
...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/operators/row_conv_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
namespace paddle { namespace paddle {
...@@ -54,7 +58,6 @@ class RowConvGradOp : public framework::OperatorWithKernel { ...@@ -54,7 +58,6 @@ class RowConvGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"), PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) should not be null."); "Input(Filter) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
...@@ -62,8 +65,8 @@ class RowConvGradOp : public framework::OperatorWithKernel { ...@@ -62,8 +65,8 @@ class RowConvGradOp : public framework::OperatorWithKernel {
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
auto x_dims = ctx->GetInputDim("X"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
ctx->SetOutputDim(x_grad_name, x_dims); ctx->SetOutputDim(x_grad_name, dout_dims);
} }
auto filter_grad_name = framework::GradVarName("Filter"); auto filter_grad_name = framework::GradVarName("Filter");
...@@ -259,12 +262,31 @@ class RowConvGradKernel<platform::CPUDeviceContext, T> ...@@ -259,12 +262,31 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
} }
} }
}; };
class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("row_conv_grad");
op->SetAttrMap(Attrs());
op->SetInput("X", Input("X"));
op->SetInput("Filter", Input("Filter"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
return op;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::RowConvGradOpDescMaker);
REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>); row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -64,8 +64,9 @@ with random values sampled from a uniform distribution. ...@@ -64,8 +64,9 @@ with random values sampled from a uniform distribution.
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OPERATOR(uniform_random_batch_size_like,
uniform_random_batch_size_like, paddle::operators::UniformRandomBatchSizeLikeOp,
paddle::operators::UniformRandomBatchSizeLikeOp, paddle::operators::UniformRandomBatchSizeLikeOpMaker,
paddle::operators::UniformRandomBatchSizeLikeOpMaker); paddle::framework::EmptyGradOpMaker,
paddle::operators::BatchSizeLikeNoNeedBufferVarsInference);
// Kernels are registered in uniform_random_op.cc and uniform_random_op.cu // Kernels are registered in uniform_random_op.cc and uniform_random_op.cu
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor imperative_profiler) tracer analysis_predictor imperative_profiler nccl_context)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
......
...@@ -29,7 +29,7 @@ namespace paddle { ...@@ -29,7 +29,7 @@ namespace paddle {
namespace pybind { namespace pybind {
// Bind Methods // Bind Methods
void BindTracer(pybind11::module* m) { void BindImperative(pybind11::module* m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "") pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer& self, framework::BlockDesc* root_block) { [](imperative::Tracer& self, framework::BlockDesc* root_block) {
...@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) { ...@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) {
}) })
.def("py_trace", &imperative::Tracer::PyTrace, .def("py_trace", &imperative::Tracer::PyTrace,
pybind11::return_value_policy::take_ownership); pybind11::return_value_policy::take_ownership);
// define parallel context
pybind11::class_<imperative::ParallelStrategy> parallel_strategy(
*m, "ParallelStrategy", "");
parallel_strategy.def(pybind11::init())
.def_property(
"nranks",
[](const imperative::ParallelStrategy& self) { return self.nranks_; },
[](imperative::ParallelStrategy& self, int nranks) {
self.nranks_ = nranks;
})
.def_property("local_rank",
[](const imperative::ParallelStrategy& self) {
return self.local_rank_;
},
[](imperative::ParallelStrategy& self, int local_rank) {
self.local_rank_ = local_rank;
})
.def_property(
"trainer_endpoints",
[](const imperative::ParallelStrategy& self) {
return self.trainer_endpoints_;
},
[](imperative::ParallelStrategy& self, std::vector<std::string> eps) {
self.trainer_endpoints_ = eps;
})
.def_property("current_endpoint",
[](const imperative::ParallelStrategy& self) {
return self.current_endpoint_;
},
[](imperative::ParallelStrategy& self,
const std::string& ep) { self.current_endpoint_ = ep; });
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
pybind11::class_<imperative::NCCLParallelContext> nccl_ctx(
*m, "NCCLParallelContext");
nccl_ctx
.def(pybind11::init<const imperative::ParallelStrategy&,
const platform::CUDAPlace&>())
.def("init", [](imperative::NCCLParallelContext& self) { self.Init(); });
#endif
} }
} // namespace pybind } // namespace pybind
......
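A hedged usage sketch for the bindings registered above, assuming they are exposed through paddle.fluid.core as elsewhere in this change:

import paddle.fluid as fluid

strategy = fluid.core.ParallelStrategy()
strategy.nranks = 2
strategy.local_rank = 0
strategy.trainer_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
strategy.current_endpoint = "127.0.0.1:6170"

# NCCLParallelContext is only bound in CUDA builds on non-Windows platforms:
# ctx = fluid.core.NCCLParallelContext(strategy, fluid.core.CUDAPlace(0))
# ctx.init()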
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
...@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase { ...@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase {
using imperative::VarBase::VarBase; // Inherit constructors using imperative::VarBase::VarBase; // Inherit constructors
}; };
void BindTracer(pybind11::module* m); void BindImperative(pybind11::module* m);
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) { ...@@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) {
}) })
.def_static("num_funcs", &imperative::PyLayer::NumFuncs); .def_static("num_funcs", &imperative::PyLayer::NumFuncs);
BindTracer(&m); BindImperative(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol()) py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer( .def_buffer(
......
...@@ -32,6 +32,7 @@ default_envs = { ...@@ -32,6 +32,7 @@ default_envs = {
"NCCL_SOCKET_IFNAME": "eth0", "NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3", "NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0", "NCCL_IB_RETRY_CNT": "0",
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
} }
GPUS = 8 GPUS = 8
......
...@@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): ...@@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
for idx, op_desc in enumerate(op_descs): for idx, op_desc in enumerate(op_descs):
for arg in op_desc.input_arg_names(): for arg in op_desc.input_arg_names():
if core.grad_var_suffix() in arg and arg in no_grad_set: if core.grad_var_suffix() in arg and arg in no_grad_set:
to_insert.append((_create_op_desc_("fill_zeros_like", { x_in = _strip_grad_suffix_(arg)
"X": [_strip_grad_suffix_(arg)] x_in_var_desc = op_desc.block().find_var_recursive(
}, {"Out": [arg]}, {}), idx)) cpt.to_bytes(x_in))
assert x_in_var_desc is not None, "Variable {} not found".format(
x_in)
dtype = x_in_var_desc.dtype()
to_insert.append(
(_create_op_desc_("fill_zeros_like2", {"X": [x_in]},
{"Out": [arg]}, {"dtype": dtype}), idx))
list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
......
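The inserted op now carries the forward variable's dtype explicitly; in NumPy terms the intended semantics are roughly (an illustrative sketch, not the framework API):

import numpy as np

def fill_zeros_like2_ref(x_in, dtype):
    # Out: zeros with the shape of X and the explicitly requested dtype
    return np.zeros(np.shape(x_in), dtype=dtype)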
...@@ -29,6 +29,9 @@ from .tracer import * ...@@ -29,6 +29,9 @@ from .tracer import *
from . import profiler from . import profiler
from .profiler import * from .profiler import *
from . import parallel
from .parallel import *
from . import checkpoint from . import checkpoint
from .checkpoint import * from .checkpoint import *
...@@ -41,5 +44,6 @@ __all__ += base.__all__ ...@@ -41,5 +44,6 @@ __all__ += base.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__ __all__ += profiler.__all__
__all__ += parallel.__all__
__all__ += checkpoint.__all__ __all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__ __all__ += learning_rate_scheduler.__all__
...@@ -15,19 +15,20 @@ ...@@ -15,19 +15,20 @@
from __future__ import print_function from __future__ import print_function
from six.moves import reduce from six.moves import reduce
import numpy as np
from .. import core from .. import core
from ..layers import utils from ..layers import utils
from . import layers from . import layers
from ..framework import Variable, OpProtoHolder, Parameter from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter
from ..layers import layer_function_generator
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..initializer import Normal, Constant, NumpyArrayInitializer from ..initializer import Normal, Constant, NumpyArrayInitializer
import numpy as np
__all__ = [ __all__ = [
'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit',
'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv' 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose',
'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm',
'TreeConv'
] ]
...@@ -137,6 +138,303 @@ class Conv2D(layers.Layer): ...@@ -137,6 +138,303 @@ class Conv2D(layers.Layer):
return self._helper.append_activation(pre_act, act=self._act) return self._helper.append_activation(pre_act, act=self._act)
class Conv3D(layers.Layer):
"""
**Convolution3D Layer**
The convolution3D layer calculates the output based on the input, filter
and the strides, paddings, dilations and groups parameters. Input(Input) and
Output(Output) are in NCDHW format, where N is batch size, C is the number of
channels, D is the depth of the feature, H is the height of the feature,
and W is the width of the feature. Convolution3D is similar to Convolution2D
but adds one dimension (depth). If bias attribute and activation type are
provided, bias is added to the output of the convolution, and the
corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
Args:
input (Variable): The input image with [N, C, D, H, W] format.
num_filters(int): The number of filters. It is the same as the number of
output image channels.
filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
Otherwise, filter_size_D = filter_size_H = filter_size_W = filter_size.
stride (int|tuple): The stride size. If stride is a tuple, it must
contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
stride_D = stride_H = stride_W = stride. Default: stride = 1.
padding (int|tuple): The padding size. If padding is a tuple, it must
contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
padding_D = padding_H = padding_W = padding. Default: padding = 0.
dilation (int|tuple): The dilation size. If dilation is a tuple, it must
contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
groups (int): The groups number of the Conv3d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter
is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
:math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: The tensor variable storing the convolution and \
non-linearity activation result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
conv3d = fluid.dygraph.nn.Conv3D('conv3d', num_filters=2, filter_size=3, act="relu")
ret = conv3d(data)
"""
def __init__(self,
name_scope,
num_filters,
filter_size,
stride=1,
padding=0,
dilation=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None):
assert param_attr is not False, "param_attr should not be False here."
super(Conv3D, self).__init__(name_scope)
self._groups = groups
self._stride = utils.convert_to_list(stride, 3, 'stride')
self._padding = utils.convert_to_list(padding, 3, 'padding')
self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
self._act = act
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
self._use_cudnn = use_cudnn
self._filter_size = filter_size
self._num_filters = num_filters
self._param_attr = param_attr
self._bias_attr = bias_attr
def _build_once(self, input):
num_channels = input.shape[1]
self._dtype = self._helper.input_dtype(input)
if self._groups is None:
num_filter_channels = num_channels
else:
if num_channels % self._groups != 0:
raise ValueError("num_channels must be divisible by groups.")
num_filter_channels = num_channels // self._groups
filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size')
filter_shape = [self._num_filters, num_filter_channels] + filter_size
def _get_default_param_initializer():
filter_elem_num = filter_size[0] * filter_size[1] * filter_size[
2] * num_channels
std = (2.0 / filter_elem_num)**0.5
return Normal(0.0, std, 0)
self._filter_param = self.create_parameter(
attr=self._param_attr,
shape=filter_shape,
dtype=self._dtype,
default_initializer=_get_default_param_initializer())
self._bias_param = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='conv3d',
inputs={
'Input': input,
'Filter': self._filter_param,
},
outputs={"Output": pre_bias},
attrs={
'strides': self._stride,
'paddings': self._padding,
'dilations': self._dilation,
'groups': self._groups if self._groups else 1,
'use_cudnn': self._use_cudnn,
'use_mkldnn': False
})
pre_act = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='elementwise_add',
inputs={'X': [pre_bias],
'Y': [self._bias_param]},
outputs={'Out': [pre_act]},
attrs={'axis': 1})
return self._helper.append_activation(pre_act, act=self._act)
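A usage sketch for the layer above, mirroring the unit test added later in this change (the dygraph import paths are assumptions about this commit's module layout):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import base, nn  # assumed module layout

with fluid.dygraph.guard():
    images = np.ones([2, 3, 6, 6, 6], dtype='float32')  # NCDHW
    conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
    ret = conv3d(base.to_variable(images))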
class Conv3DTranspose(layers.Layer):
def __init__(self,
name_scope,
num_filters,
output_size=None,
filter_size=None,
padding=0,
stride=1,
dilation=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
name=None):
super(Conv3DTranspose, self).__init__(name_scope)
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
self._padding = utils.convert_to_list(padding, 3, 'padding')
self._stride = utils.convert_to_list(stride, 3, 'stride')
self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
self._param_attr = param_attr
self._filter_size = filter_size
self._output_size = output_size
self._groups = 1 if groups is None else groups
self._num_filters = num_filters
self._use_cudnn = use_cudnn
self._bias_attr = bias_attr
self._act = act
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
self._input_channel = input.shape[1]
if self._filter_size is None:
if self._output_size is None:
raise ValueError(
"output_size must be set when filter_size is None")
if isinstance(self._output_size, int):
self._output_size = [self._output_size] * 3  # D, H and W all use the same target size
d_in = input.shape[2]
h_in = input.shape[3]
w_in = input.shape[4]
filter_size_d = (self._output_size[0] -
(d_in - 1) * self._stride[0] + 2 * self._padding[0]
- 1) // self._dilation[0] + 1
filter_size_h = (self._output_size[1] -
(h_in - 1) * self._stride[1] + 2 * self._padding[1]
- 1) // self._dilation[1] + 1
filter_size_w = (self._output_size[2] -
(w_in - 1) * self._stride[2] + 2 * self._padding[2]
- 1) // self._dilation[2] + 1
self._filter_size = [filter_size_d, filter_size_h, filter_size_w]
else:
self._filter_size = utils.convert_to_list(
self._filter_size, 3, 'conv3d_transpose.filter_size')
filter_shape = [
self._input_channel, self._num_filters // self._groups
] + self._filter_size
self._img_filter = self.create_parameter(
dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
if self._bias_attr:
self._bias_param = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type="conv3d_transpose",
inputs={'Input': [input],
'Filter': [self._img_filter]},
outputs={'Output': pre_bias},
attrs={
'strides': self._stride,
'paddings': self._padding,
'dilations': self._dilation,
'groups': self._groups if self._groups else 1,
'use_cudnn': self._use_cudnn
})
if self._bias_attr:
pre_act = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='elementwise_add',
inputs={'X': [pre_bias],
'Y': [self._bias_param]},
outputs={'Out': [pre_act]},
attrs={'axis': 1})
else:
pre_act = pre_bias
# Currently, we don't support inplace in imperative mode
return self._helper.append_activation(pre_act, act=self._act)
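A matching sketch for Conv3DTranspose, again following the unit test added below (imports as in the earlier sketch):

with fluid.dygraph.guard():
    data = np.arange(0, 48).reshape([2, 3, 2, 2, 2]).astype('float32')
    conv3d_transpose = nn.Conv3DTranspose(
        'Conv3DTranspose', num_filters=12, filter_size=12, use_cudnn=False)
    out = conv3d_transpose(base.to_variable(data))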
class Pool2D(layers.Layer): class Pool2D(layers.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
...@@ -1388,6 +1686,8 @@ class SequenceConv(layers.Layer): ...@@ -1388,6 +1686,8 @@ class SequenceConv(layers.Layer):
bias_attr=None, bias_attr=None,
param_attr=None, param_attr=None,
act=None): act=None):
assert not _in_dygraph_mode(
), "SequenceConv is not supported by dynamic graph mode yet!"
super(SequenceConv, self).__init__(name_scope) super(SequenceConv, self).__init__(name_scope)
self._num_filters = num_filters self._num_filters = num_filters
self._filter_size = filter_size self._filter_size = filter_size
...@@ -1397,12 +1697,10 @@ class SequenceConv(layers.Layer): ...@@ -1397,12 +1697,10 @@ class SequenceConv(layers.Layer):
self._param_attr = param_attr self._param_attr = param_attr
def _build_once(self, input): def _build_once(self, input):
self._dtype = self._helper.input_dtype(input) self._dtype = self._helper.input_dtype(input)
print(self._filter_size)
filter_shape = [self._filter_size * input.shape[1], self._num_filters] filter_shape = [self._filter_size * input.shape[1], self._num_filters]
self._filter_param = self.create_parameter( self._filter_param = self.create_parameter(
attr=self.param_attr, shape=filter_shape, dtype=self._dtype) attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
def forward(self, input): def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(self._dtype) pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
...@@ -1420,3 +1718,237 @@ class SequenceConv(layers.Layer): ...@@ -1420,3 +1718,237 @@ class SequenceConv(layers.Layer):
}) })
pre_act = self._helper.append_bias_op(pre_bias) pre_act = self._helper.append_bias_op(pre_bias)
return self._helper.append_activation(pre_act) return self._helper.append_activation(pre_act)
class RowConv(layers.Layer):
def __init__(self,
name_scope,
future_context_size,
param_attr=None,
act=None):
assert not _in_dygraph_mode(
), "RowConv is not supported by dynamic graph mode yet!"
super(RowConv, self).__init__(name_scope)
self._act = act
self._param_attr = param_attr
self._future_context_size = future_context_size
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
filter_shape = [self._future_context_size + 1, input.shape[1]]
self._filter_param = self.create_parameter(
attr=self._param_attr,
shape=filter_shape,
dtype=self._dtype,
is_bias=False)
def forward(self, input):
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type='row_conv',
inputs={'X': [input],
'Filter': [self._filter_param]},
outputs={'Out': [out]})
return self._helper.append_activation(out, act=self._act)
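RowConv applies a per-feature look-ahead filter within each sequence; a NumPy sketch of the computation for one sequence (my paraphrase of the row_conv op's semantics, with an illustrative function name):

import numpy as np

def row_conv_ref(x, w):
    # x: [T, D], one sequence; w: [future_context_size + 1, D]
    T = x.shape[0]
    k = w.shape[0]
    out = np.zeros_like(x)
    for i in range(T):
        for j in range(min(k, T - i)):
            out[i] += w[j] * x[i + j]  # elementwise per feature, looking ahead
    return out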
class GroupNorm(layers.Layer):
"""
**Group Normalization Layer**
Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
Args:
name_scope (str): See base class.
groups(int): The number of groups that divided from channels.
epsilon(float): The small value added to the variance to prevent
division by zero.
param_attr(ParamAttr|None): The parameter attribute for the learnable
scale :math:`g`. If it is set to False, no scale will be added to the output units.
If it is set to None, the scale is initialized to one. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized to zero. Default: None.
act(str): Activation to be applied to the output of group normalization.
data_layout(string|NCHW): Only NCHW is supported.
dtype(np.dtype|core.VarDesc.VarType|str): The data type: float32, float_16, int, etc.
Returns:
Variable: A tensor variable which is the result after applying group normalization on the input.
"""
def __init__(self,
name_scope,
groups,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None,
data_layout='NCHW'):
super(GroupNorm, self).__init__(name_scope)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._epsilon = epsilon
self._groups = groups
self._act = act
if data_layout != 'NCHW':
raise ValueError("unsupported data layout:" + data_layout)
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
param_shape = [input.shape[1]]
if self._bias_attr:
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True)
if self._param_attr:
self._scale = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0))
def forward(self, input):
inputs = {'X': input}
if self._bias_attr:
    inputs['Bias'] = self._bias
if self._param_attr:
    inputs['Scale'] = self._scale
# create output
mean_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
variance_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
group_norm_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type="group_norm",
inputs=inputs,
outputs={
"Y": group_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={"epsilon": self._epsilon,
"groups": self._groups})
return self._helper.append_activation(group_norm_out, self._act)
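What the appended group_norm op computes, as a NumPy sketch (statistics per sample and group; group_norm_ref is an illustrative name):

import numpy as np

def group_norm_ref(x, groups, eps=1e-5, scale=None, bias=None):
    n, c, h, w = x.shape  # NCHW, the only layout accepted above
    g = x.reshape(n, groups, c // groups, h, w)
    mean = g.mean(axis=(2, 3, 4), keepdims=True)
    var = g.var(axis=(2, 3, 4), keepdims=True)
    y = ((g - mean) / np.sqrt(var + eps)).reshape(n, c, h, w)
    if scale is not None:
        y = y * scale.reshape(1, c, 1, 1)
    if bias is not None:
        y = y + bias.reshape(1, c, 1, 1)
    return y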
class SpectralNorm(layers.Layer):
def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None):
super(SpectralNorm, self).__init__(name_scope)
self._power_iters = power_iters
self._eps = eps
self._dim = dim
def _build_once(self, weight):
self._dtype = self._helper.input_dtype(weight)
input_shape = weight.shape
h = input_shape[self._dim]
w = np.prod(input_shape) // h
self.u = self.create_parameter(
attr=ParamAttr(),
shape=[h],
dtype=self._dtype,
default_initializer=Normal(0., 1.))
self.u.stop_gradient = True
self.v = self.create_parameter(
attr=ParamAttr(),
shape=[w],
dtype=self._dtype,
default_initializer=Normal(0., 1.))
self.v.stop_gradient = True
def forward(self, weight):
inputs = {'Weight': weight, 'U': self.u, 'V': self.v}
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="spectral_norm",
inputs=inputs,
outputs={"Out": out, },
attrs={
"dim": self._dim,
"power_iters": self._power_iters,
"eps": self._eps,
})
return out
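The spectral_norm op estimates the largest singular value of the flattened weight by power iteration and divides the weight by it; a rough NumPy sketch (the real op's normalization details may differ slightly):

import numpy as np

def spectral_norm_ref(weight, u, v, dim=0, power_iters=1, eps=1e-12):
    h = weight.shape[dim]
    w_mat = np.moveaxis(weight, dim, 0).reshape(h, -1)  # [h, w]
    for _ in range(power_iters):
        v = w_mat.T.dot(u)
        v /= np.linalg.norm(v) + eps
        u = w_mat.dot(v)
        u /= np.linalg.norm(u) + eps
    sigma = u.dot(w_mat).dot(v)  # estimated largest singular value
    return weight / sigma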
class TreeConv(layers.Layer):
def __init__(self,
name_scope,
output_size,
num_filters=1,
max_depth=2,
act='tanh',
param_attr=None,
bias_attr=None,
name=None):
super(TreeConv, self).__init__(name_scope)
self._name = name
self._output_size = output_size
self._act = act
self._max_depth = max_depth
self._num_filters = num_filters
self._bias_attr = bias_attr
self._param_attr = param_attr
def _build_once(self, nodes_vector, edge_set):
assert isinstance(nodes_vector, Variable)
assert isinstance(edge_set, Variable)
self._dtype = self._helper.input_dtype(nodes_vector)
feature_size = nodes_vector.shape[2]
w_shape = [feature_size, 3, self._output_size, self._num_filters]
if self._bias_attr:
self._bias_param = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
self.W = self.create_parameter(
attr=self._param_attr,
shape=w_shape,
dtype=self._dtype,
is_bias=False)
def forward(self, nodes_vector, edge_set):
if self._name:
out = self.create_variable(
name=self._name, dtype=self._dtype, persistable=False)
else:
out = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='tree_conv',
inputs={
'NodesVector': nodes_vector,
'EdgeSet': edge_set,
'Filter': self.W
},
outputs={'Out': out, },
attrs={'max_depth': self._max_depth})
if self._bias_attr:
pre_activation = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='elementwise_add',
inputs={'X': [out],
'Y': [self._bias_param]},
outputs={'Out': [pre_activation]},
attrs={'axis': 1})
else:
pre_activation = out
return self._helper.append_activation(pre_activation, act=self._act)
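A usage sketch for TreeConv, following the unit test added below (imports as in the earlier sketches):

with fluid.dygraph.guard():
    nodes = np.random.random((1, 10, 5)).astype('float32')
    edges = np.array([1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9,
                      4, 10]).reshape((1, 9, 2)).astype('int32')
    tree_conv = nn.TreeConv('TreeConv', output_size=6, num_filters=1, max_depth=2)
    out = tree_conv(base.to_variable(nodes), base.to_variable(edges))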
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .. import core
__all__ = ["prepare_context"]
ParallelStrategy = core.ParallelStrategy
__parallel_ctx__clz__ = None
def prepare_context(parallel_strategy, place):
global __parallel_ctx__clz__
assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."
if isinstance(place, core.CUDAPlace):
__parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy,
place)
else:
# TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
raise NotImplementedError("Only support CUDAPlace for now.")
__parallel_ctx__clz__.init()
class Env(object):
def __init__(self):
self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
@property
def nranks(self):
return self._nranks
@property
def local_rank(self):
return self._local_rank
@property
def dev_id(self):
return self._dev_id
@property
def current_endpoint(self):
return self._current_endpoint
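A hedged sketch of wiring Env into prepare_context on a CUDA build (Env exposes no public trainer_endpoints property in this commit, so the private attribute is used purely for illustration):

import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import Env, prepare_context  # assumed path

env = Env()  # reads PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, ...
strategy = fluid.core.ParallelStrategy()
strategy.nranks = env.nranks
strategy.local_rank = env.local_rank
strategy.trainer_endpoints = env._trainer_endpoints
strategy.current_endpoint = env.current_endpoint
prepare_context(strategy, fluid.core.CUDAPlace(env.dev_id))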
...@@ -191,6 +191,7 @@ __all__ = [ ...@@ -191,6 +191,7 @@ __all__ = [
'kldiv_loss', 'kldiv_loss',
'tree_conv', 'tree_conv',
'npair_loss', 'npair_loss',
'pixel_shuffle',
'fsp_matrix', 'fsp_matrix',
] ]
...@@ -10961,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): ...@@ -10961,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
return l2loss + celoss return l2loss + celoss
def pixel_shuffle(x, upscale_factor):
"""
**Pixel Shuffle Layer**
This layer rearranges elements in a tensor of shape [N, C, H, W]
to a tensor of shape [N, C/r**2, H*r, W*r].
This is useful for implementing efficient sub-pixel convolution
with a stride of 1/r.
Please refer to the paper `Real-Time Single Image and Video Super-Resolution
Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et al. (2016) for more details.
.. code-block:: text
Given a 4-D tensor with the shape:
x.shape = [1, 9, 4, 4]
Given upscale_factor:
upscale_factor = 3
output shape is:
[1, 1, 12, 12]
Args:
x(Variable): The input tensor variable.
upscale_factor(int): Factor to increase spatial resolution.
Returns:
Out(Variable): The pixel shuffle result, a tensor variable of shape [N, C/upscale_factor**2, H*upscale_factor, W*upscale_factor] with the same type as the input.
Raises:
ValueError: If the square of upscale_factor cannot divide the channels of input.
Examples:
.. code-block:: python
input = fluid.layers.data(name="input", shape=[9, 4, 4], dtype="float32")
output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3)
"""
helper = LayerHelper("pixel_shuffle", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
if not isinstance(upscale_factor, int):
raise TypeError("upscale factor must be int type")
helper.append_op(
type="pixel_shuffle",
inputs={"X": x},
outputs={"Out": out},
attrs={"upscale_factor": upscale_factor})
return out
def fsp_matrix(x, y): def fsp_matrix(x, y):
""" """
......
...@@ -227,7 +227,7 @@ class Precision(MetricBase): ...@@ -227,7 +227,7 @@ class Precision(MetricBase):
metric.reset() metric.reset()
for data in train_reader(): for data in train_reader():
loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
metric.update(preds=preds, labels=labels) metric.update(preds=preds, labels=labels)
numpy_precision = metric.eval() numpy_precision = metric.eval()
""" """
...@@ -241,9 +241,11 @@ class Precision(MetricBase): ...@@ -241,9 +241,11 @@ class Precision(MetricBase):
raise ValueError("The 'preds' must be a numpy ndarray.") raise ValueError("The 'preds' must be a numpy ndarray.")
if not _is_numpy_(labels): if not _is_numpy_(labels):
raise ValueError("The 'labels' must be a numpy ndarray.") raise ValueError("The 'labels' must be a numpy ndarray.")
sample_num = labels[0] sample_num = labels.shape[0]
preds = np.rint(preds).astype("int32")
for i in range(sample_num): for i in range(sample_num):
pred = preds[i].astype("int32") pred = preds[i]
label = labels[i] label = labels[i]
if label == 1: if label == 1:
if pred == label: if pred == label:
......
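For example, with preds = np.array([[0.7], [0.2]]) and labels = np.array([[1], [0]]), the fixed code yields sample_num = 2 and integer predictions [[1], [0]], where the old code would have taken sample_num from the first label value instead of the batch dimension.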
...@@ -81,6 +81,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet) ...@@ -81,6 +81,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
list(REMOVE_ITEM TEST_OPS test_layers)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP) endforeach(TEST_OP)
...@@ -118,7 +119,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE ...@@ -118,7 +119,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
if(NOT WIN32) if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
endif() endif()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
from test_conditional_block import *
if __name__ == '__main__':
unittest.main()
...@@ -23,6 +23,8 @@ from test_elementwise_sub_op import * ...@@ -23,6 +23,8 @@ from test_elementwise_sub_op import *
from test_concat_op import * from test_concat_op import *
from test_gather_op import * from test_gather_op import *
from test_gaussian_random_batch_size_like_op import * from test_gaussian_random_batch_size_like_op import *
from test_uniform_random_batch_size_like_op import *
from test_fill_constant_batch_size_like_op import *
from test_lod_reset_op import * from test_lod_reset_op import *
from test_scatter_op import * from test_scatter_op import *
from test_mean_op import * from test_mean_op import *
...@@ -40,6 +42,7 @@ from test_sequence_unpad_op import * ...@@ -40,6 +42,7 @@ from test_sequence_unpad_op import *
from test_sequence_scatter_op import * from test_sequence_scatter_op import *
from test_sequence_slice_op import * from test_sequence_slice_op import *
from test_pad2d_op import * from test_pad2d_op import *
from test_fill_zeros_like2_op import *
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.framework import convert_np_dtype_to_dtype_
from op_test import OpTest
class TestFillZerosLike2Op(OpTest):
def setUp(self):
self.op_type = "fill_zeros_like2"
self.dtype = np.float32
self.init_dtype()
self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)}
def init_dtype(self):
pass
def test_check_output(self):
self.check_output()
class TestFillZerosLike2OpFp16(TestFillZerosLike2Op):
def init_dtype(self):
self.dtype = np.float16
class TestFillZerosLike2OpFp64(TestFillZerosLike2Op):
def init_dtype(self):
self.dtype = np.float64
if __name__ == "__main__":
unittest.main()
...@@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase): ...@@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase):
self.assertEqual(mlp._fc2, sublayers[1]) self.assertEqual(mlp._fc2, sublayers[1])
self.assertEqual(len(sublayers), 2) self.assertEqual(len(sublayers), 2)
def test_dygraph_vs_static(self):
inp1 = np.random.rand(4, 3, 3)
inp2 = np.random.rand(4, 3, 3)
# dynamic graph
with fluid.dygraph.guard():
if np.sum(inp1) < np.sum(inp2):
x = fluid.layers.elementwise_add(inp1, inp2)
else:
x = fluid.layers.elementwise_sub(inp1, inp2)
dygraph_result = x._numpy()
# static graph
with new_program_scope():
inp_data1 = fluid.layers.data(
name='inp1', shape=[3, 3], dtype=np.float32)
inp_data2 = fluid.layers.data(
name='inp2', shape=[3, 3], dtype=np.float32)
a = fluid.layers.expand(
fluid.layers.reshape(
fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1])
b = fluid.layers.expand(
fluid.layers.reshape(
fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1])
cond = fluid.layers.less_than(x=a, y=b)
ie = fluid.layers.IfElse(cond)
with ie.true_block():
d1 = ie.input(inp_data1)
d2 = ie.input(inp_data2)
d3 = fluid.layers.elementwise_add(d1, d2)
ie.output(d3)
with ie.false_block():
d1 = ie.input(inp_data1)
d2 = ie.input(inp_data2)
d3 = fluid.layers.elementwise_sub(d1, d2)
ie.output(d3)
out = ie()
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
static_result = exe.run(fluid.default_main_program(),
feed={'inp1': inp1,
'inp2': inp2},
fetch_list=out)[0]
self.assertTrue(np.allclose(dygraph_result, static_result))
def test_rnn(self): def test_rnn(self):
np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
[10.0, 11.0, 12.0]]) [10.0, 11.0, 12.0]])
......
...@@ -302,8 +302,11 @@ use_py_reader = False ...@@ -302,8 +302,11 @@ use_py_reader = False
# if we run sync mode # if we run sync mode
sync = False sync = False
# how many batches we use if not core.is_compiled_with_cuda():
batch_num = 50 # how many batches we use
batch_num = 50
else:
batch_num = 5
np.random.seed = 1 np.random.seed = 1
src_word_np = np.random.randint( src_word_np = np.random.randint(
......
...@@ -600,6 +600,280 @@ class TestLayer(LayerTest): ...@@ -600,6 +600,280 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt)) self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt))
def test_conv3d(self):
with self.static_graph():
images = layers.data(
name='pixel', shape=[3, 6, 6, 6], dtype='float32')
ret = layers.conv3d(input=images, num_filters=3, filter_size=2)
static_ret = self.get_static_graph_result(
feed={'pixel': np.ones(
[2, 3, 6, 6, 6], dtype='float32')},
fetch_list=[ret])[0]
with self.static_graph():
images = layers.data(
name='pixel', shape=[3, 6, 6, 6], dtype='float32')
conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
ret = conv3d(images)
static_ret2 = self.get_static_graph_result(
feed={'pixel': np.ones(
[2, 3, 6, 6, 6], dtype='float32')},
fetch_list=[ret])[0]
with self.dynamic_graph():
images = np.ones([2, 3, 6, 6, 6], dtype='float32')
conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
dy_ret = conv3d(base.to_variable(images))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_row_conv(self):
input = np.arange(15).reshape([3, 5]).astype('float32')
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
with self.static_graph():
x = layers.data(
name='X',
shape=[3, 5],
dtype='float32',
lod_level=1,
append_batch_size=False)
ret = layers.row_conv(input=x, future_context_size=2)
static_ret = self.get_static_graph_result(
feed={
'X': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
},
fetch_list=[ret],
with_lod=True)[0]
with self.static_graph():
x = layers.data(
name='X',
shape=[3, 5],
dtype='float32',
lod_level=1,
append_batch_size=False)
rowConv = nn.RowConv('RowConv', future_context_size=2)
ret = rowConv(x)
static_ret2 = self.get_static_graph_result(
feed={
'X': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
},
fetch_list=[ret],
with_lod=True)[0]
# TODO: dygraph can't support LODTensor
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_group_norm(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
shape = (2, 4, 3, 3)
input = np.random.random(shape).astype('float32')
with self.static_graph():
X = fluid.layers.data(
name='X',
shape=shape,
dtype='float32',
lod_level=1,
append_batch_size=False)
ret = layers.group_norm(input=X, groups=2)
static_ret = self.get_static_graph_result(
feed={
'X': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1]], place=place)
},
fetch_list=[ret],
with_lod=True)[0]
with self.static_graph():
X = fluid.layers.data(
name='X',
shape=shape,
dtype='float32',
lod_level=1,
append_batch_size=False)
groupNorm = nn.GroupNorm('GroupNorm', groups=2)
ret = groupNorm(X)
static_ret2 = self.get_static_graph_result(
feed={
'X': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1]], place=place)
},
fetch_list=[ret],
with_lod=True)[0]
with self.dynamic_graph():
groupNorm = nn.GroupNorm('GroupNorm', groups=2)
dy_ret = groupNorm(base.to_variable(input))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_spectral_norm(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
shape = (2, 4, 3, 3)
input = np.random.random(shape).astype('float32')
with self.static_graph():
Weight = fluid.layers.data(
name='Weight',
shape=shape,
dtype='float32',
lod_level=1,
append_batch_size=False)
ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2)
static_ret = self.get_static_graph_result(
feed={
'Weight': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1]], place=place),
},
fetch_list=[ret],
with_lod=True)[0]
with self.static_graph():
Weight = fluid.layers.data(
name='Weight',
shape=shape,
dtype='float32',
lod_level=1,
append_batch_size=False)
spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
ret = spectralNorm(Weight)
static_ret2 = self.get_static_graph_result(
feed={
'Weight': fluid.create_lod_tensor(
data=input, recursive_seq_lens=[[1, 1]], place=place)
},
fetch_list=[ret],
with_lod=True)[0]
with self.dynamic_graph():
spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
dy_ret = spectralNorm(base.to_variable(input))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_tree_conv(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10]
adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32')
adj = np.tile(adj, (1, 1, 1))
vectors = np.random.random((1, 10, 5)).astype('float32')
with self.static_graph():
NodesVector = fluid.layers.data(
name='NodesVector',
shape=(1, 10, 5),
dtype='float32',
lod_level=1,
append_batch_size=False)
EdgeSet = fluid.layers.data(
name='EdgeSet',
shape=(1, 9, 2),
dtype='int32',
lod_level=1,
append_batch_size=False)
ret = layers.tree_conv(
nodes_vector=NodesVector,
edge_set=EdgeSet,
output_size=6,
num_filters=1,
max_depth=2)
static_ret = self.get_static_graph_result(
feed={
'NodesVector': fluid.create_lod_tensor(
data=vectors, recursive_seq_lens=[[1]], place=place),
'EdgeSet': fluid.create_lod_tensor(
data=adj, recursive_seq_lens=[[1]], place=place)
},
fetch_list=[ret],
with_lod=False)[0]
with self.static_graph():
NodesVector = fluid.layers.data(
name='NodesVector',
shape=(1, 10, 5),
dtype='float32',
lod_level=1,
append_batch_size=False)
EdgeSet = fluid.layers.data(
name='EdgeSet',
shape=(1, 9, 2),
dtype='int32',
lod_level=1,
append_batch_size=False)
treeConv = nn.TreeConv(
'TreeConv', output_size=6, num_filters=1, max_depth=2)
ret = treeConv(NodesVector, EdgeSet)
static_ret2 = self.get_static_graph_result(
feed={
'NodesVector': fluid.create_lod_tensor(
data=vectors, recursive_seq_lens=[[1]], place=place),
'EdgeSet': fluid.create_lod_tensor(
data=adj, recursive_seq_lens=[[1]], place=place)
},
fetch_list=[ret],
with_lod=False)[0]
with self.dynamic_graph():
treeConv = nn.TreeConv(
'TreeConv', output_size=6, num_filters=1, max_depth=2)
dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj))
self.assertTrue(np.allclose(static_ret, static_ret2))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
def test_conv3d_transpose(self):
input_array = np.arange(0, 48).reshape(
[2, 3, 2, 2, 2]).astype('float32')
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
out = layers.conv3d_transpose(
input=img, num_filters=12, filter_size=12, use_cudnn=False)
static_rlt = self.get_static_graph_result(
feed={'pixel': input_array}, fetch_list=[out])[0]
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
conv3d_transpose = nn.Conv3DTranspose(
'Conv3DTranspose',
num_filters=12,
filter_size=12,
use_cudnn=False)
out = conv3d_transpose(img)
static_rlt2 = self.get_static_graph_result(
feed={'pixel': input_array}, fetch_list=[out])[0]
with self.dynamic_graph():
conv3d_transpose = nn.Conv3DTranspose(
'Conv3DTranspose',
num_filters=12,
filter_size=12,
use_cudnn=False)
dy_rlt = conv3d_transpose(base.to_variable(input_array))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
class TestBook(LayerTest): class TestBook(LayerTest):
def test_all_layers(self): def test_all_layers(self):
...@@ -1634,6 +1908,41 @@ class TestBook(LayerTest): ...@@ -1634,6 +1908,41 @@ class TestBook(LayerTest):
out = layers.flatten(x, axis=1, name="flatten") out = layers.flatten(x, axis=1, name="flatten")
return (out) return (out)
def test_kldiv_loss(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[32, 128, 128], dtype="float32")
target = layers.data(
name='target', shape=[32, 128, 128], dtype="float32")
loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean')
self.assertIsNotNone(loss)
print(str(program))
def test_temporal_shift(self):
program = Program()
with program_guard(program):
x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
out = layers.temporal_shift(x, seg_num=4, shift_ratio=0.2)
self.assertIsNotNone(out)
print(str(program))
def test_shuffle_channel(self):
program = Program()
with program_guard(program):
x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
out = layers.shuffle_channel(x, group=4)
self.assertIsNotNone(out)
print(str(program))
def test_pixel_shuffle(self):
program = Program()
with program_guard(program):
x = layers.data(name="X", shape=[9, 4, 4], dtype="float32")
out = layers.pixel_shuffle(x, upscale_factor=3)
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestPixelShuffle(OpTest):
def setUp(self):
self.op_type = "pixel_shuffle"
n, c, h, w = 2, 9, 4, 4
up_factor = 3
shape = [n, c, h, w]
x = np.random.random(shape).astype("float32")
new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
w)
# reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
npresult = np.reshape(x, new_shape)
# transpose to (num,output_channel,h,upscale_factor,w,upscale_factor)
npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
npresult = np.reshape(npresult, oshape)
self.inputs = {'X': x}
self.outputs = {'Out': npresult}
self.attrs = {'upscale_factor': up_factor}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()