Commit 16f09947 authored by: S sneaxiy

Merge develop

test=develop
...@@ -64,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
...@@ -190,6 +191,7 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(anakin_subgraph)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
......
...@@ -156,7 +156,7 @@ python \
This will enable VLOG messages generated by `buddy_allocator.{h,cc}` within the verbose range of 0 to 3, so you will see the example VLOG message above, which is at level 3. This suggests that we output general messages at lower verbose levels, so that they display with higher probability. When coding C++, please follow this verbose-level convention (a minimal usage sketch follows the list):
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
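As a minimal, hypothetical sketch of this convention (the function and messages below are invented for illustration; only the VLOG levels follow the table above), a message is emitted only when the effective verbose level, e.g. GLOG_v=3 or a matching GLOG_vmodule entry, is at least n:

#include "glog/logging.h"

void RunExample() {
  VLOG(1) << "framework-level message";        // shown when verbose level >= 1
  VLOG(3) << "operator-level message";         // shown when verbose level >= 3
  VLOG(5) << "memory/platform-level message";  // shown when verbose level >= 5
  VLOG(7) << "math-kernel detail message";     // shown when verbose level >= 7
}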
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
NO_DEFAULT_PATH
)
find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
PATHS ${ANAKIN_ROOT}
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
...@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
link_directories(${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
...@@ -520,6 +520,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
......
...@@ -5,6 +5,7 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
...@@ -72,7 +73,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
......
...@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
...@@ -52,13 +53,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
// Note that must assert topology sort is stable
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(op_desc->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;
auto backward_vars =
boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) {  // values
vars[v] = order;
VLOG(1) << "in all_reduce_deps_pass:" << v;
}
}
order++;
} catch (boost::bad_get e) {
}
}
std::vector<OpHandleBase*> dist_ops;
......
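For readers unfamiliar with the attribute check above: the op-role attribute is an integer bit mask, so membership in the backward role is tested with a bitwise AND. A standalone sketch with made-up enum values (illustration only; the real OpRole enum is defined alongside OpProtoAndCheckerMaker):

// Hypothetical bit-mask roles; values chosen only to show the check pattern.
enum class DemoOpRole : int { kForward = 0x0, kBackward = 0x1, kOptimize = 0x2 };

inline bool IsBackward(int role_attr) {
  return (role_attr & static_cast<int>(DemoOpRole::kBackward)) != 0;
}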
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include <string>
namespace paddle {
namespace framework {
namespace details {
FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
// fetch_barrier op always run on place0, but output on all places.
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(*node->Op())),
local_scopes_(local_scopes),
places_(places),
run_scope_(local_scopes[0]),
place_(places[0]) {
for (auto &p : places) {
this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p));
}
}
bool FetchBarrierOpHandle::IsMultiDeviceTransfer() {
// override IsMultiDeviceTransfer to return true
return true;
}
void FetchBarrierOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto run_func = [this]() {
op_->Run(*run_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
};
if (is_lock_and_record_event_free_) {
run_func();
} else {
this->RunAndRecordEvent(run_func);
}
}
bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait;
}
std::string FetchBarrierOpHandle::Name() const { return op_->Type(); }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
// **NOTE**: fetch_barrier op is special it outputs all recved variables on
// all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!!
struct FetchBarrierOpHandle : public OpHandleBase {
public:
FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
bool IsMultiDeviceTransfer() override;
std::string Name() const override;
protected:
void RunImpl() override;
bool NeedWait(VarHandleBase *in_var) override;
private:
std::unique_ptr<OperatorBase> op_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
Scope *run_scope_;
platform::Place place_;
bool is_lock_and_record_event_free_{false};
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,6 +17,8 @@
#include <deque>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
...@@ -148,12 +150,14 @@ std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
view_.Build(graph.get());
InitSSAGraphNodes();
auto cnt = 0;
for (auto* op : view_.AllOps()) {
VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue;
TryInplaceOpInputOutput(op, graph.get());
}
// graph->ResolveHazard(var_nodes_);
return graph;
}
...@@ -264,13 +268,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
//                "op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
auto* op_desc = op->Op();
auto& infer_inplace =
...@@ -281,21 +282,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type());
auto in_to_outs = infer_inplace(*op_desc);
auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) {
auto& in_para_name = pair.first;
auto& out_para_name = pair.second;
auto input_vars = op->Op()->Input(in_para_name);
if (!input_vars.size()) {
VLOG(4) << "Parameter " << in_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto output_vars = op->Op()->Output(out_para_name);
if (!output_vars.size()) {
VLOG(4) << "Parameter " << out_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto in_var_name = input_vars.at(0);
auto out_var_name = output_vars.at(0);
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
bool can_replace = true;
if (in_var_name == out_var_name) {
can_replace = false;
VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable "
<< out_var_name << " are the same";
} else if (!NodeCanReused(in_node)) {
can_replace = false;
VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
} else if (!NodeCanReused(out_node)) {
can_replace = false;
VLOG(4) << "SKIP: Output variable " << out_var_name
<< " cannot be reused";
} else if (details::NodeSize(*in_node->Var()) !=
details::NodeSize(*out_node->Var())) {
can_replace = false;
VLOG(4) << "SKIP: Input and Output varialbe size not match";
}
if (!can_replace) continue;
// 2. there is no external pending op on the input node
// if (view_.PendingOpsOnVar(in_node).size() > 1) {
if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
VLOG(4) << string::Sprintf(
"Skiped pair %s => %s. %s input has external dependency."
"inplace such pair will overwrite the memory.",
...@@ -342,6 +380,97 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
}
}
void GraphView::TopoSort(ir::Graph* graph) {
//
ops_.clear();
auto deps_num = [](ir::Node* op) {
auto cnt = 0;
for (auto& var : op->inputs)
if (var->inputs.size() > 0) ++cnt;
return cnt;
};
std::queue<std::pair<ir::Node*, uint32_t>> ready_ops;
int level = 0;
auto nodes = graph->Nodes();
std::unordered_map<ir::Node*, uint32_t> deps_map;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr) {
deps_map[node] = deps_num(node);
if (0 == deps_map[node]) {
ready_ops.push({node, level});
}
}
}
while (!ready_ops.empty()) {
auto item = ready_ops.front();
ready_ops.pop();
ops_.emplace_back(item.first);
// record level when pop from queue
op_level_[item.first] = item.second;
for (auto node : item.first->outputs) {
for (auto op : node->outputs) {
--deps_map[op];
if (deps_map[op] == 0) ready_ops.push({op, item.second + 1});
}
}
}
bool all_ops_checked = true;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
all_ops_checked = false;
break;
}
}
PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
}
// return true if current op node depeneds on all other op that use the same
// variable node
bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// get op list that rely on the same variable
auto op_list = var->outputs;
for (auto& op : op_list) {
if (op == current_op) continue;
VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & "
<< current_op->Name();
if (!CheckOpDeps(op, current_op)) return false;
VLOG(4) << "";
}
return true;
}
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
for (auto& var : op2->inputs)
if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
return false;
}
ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs
...@@ -387,22 +516,7 @@ void GraphView::Build(ir::Graph* g) {
// Because we insert some new created node. Which may have data race between
// nodes.
// resolve data harzards depends on the var nodes in right order.
TopoSort(g);
// 1. track the nodes which reused previous node in Python memory optimize.
// these node can not be inplaced, otherwise may generate a circle in graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
......
...@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
...@@ -50,10 +51,15 @@ class GraphView {
// map the parameter and gradient, must be skipped.
bool InSkipSet(const std::string& var) const;
bool CheckDeps(ir::Node* var, ir::Node* current_op) const;
bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const;
void TopoSort(ir::Graph* g);
private:
std::vector<ir::Node*> ops_;
std::unordered_set<std::string> dup_nodes_;  // mem opt affect nodes
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
std::unordered_map<ir::Node*, uint32_t> op_level_;
};
// swap pairs in sequence
......
...@@ -190,7 +190,7 @@ struct NodeComparator {
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSize(lhs) == NodeSize(rhs);
} else {
return false;
}
...@@ -449,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
if (uses_[op].count(var)) continue;
live_in_[op].erase(var);
}
......
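For context, the guard added above keeps a variable that an op both defines and uses in the op's live-in set; this matches the standard backward dataflow equation for liveness (textbook background, not text from this patch):

\mathrm{live\_in}(op) = \mathrm{use}(op) \cup \big(\mathrm{live\_out}(op) \setminus \mathrm{def}(op)\big)

Erasing every defined variable unconditionally would drop a variable that the same op also reads, so the `continue` is needed to stay consistent with this equation.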
...@@ -142,15 +142,16 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily
// FindNextBestFitNode
// auto* n = nodes[0].get();
// auto* cache = pool.FindBestFitNode(n);
// PADDLE_ENFORCE(cache->Name() == "a");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "c");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
......
...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
...@@ -851,9 +852,17 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type());
// Create fetch_barrier op handle to enable output on all devices.
// **NOTE** fetch_barrier should output variables list same as recv op does.
if (node->Op()->Type() == "fetch_barrier") {
result->Get<GraphOps>(kGraphOps).emplace_back(new FetchBarrierOpHandle(
result->CreateOpNode(node->Op()), local_scopes_, places_));
} else {
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
}
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id); CreateOpHandleIOs(result, node, op_dev_id);
......
...@@ -55,7 +55,7 @@ void OpHandleBase::Run(bool use_cuda) {
if (out_var_handle) {
int dev_id =
boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
} else {
...@@ -71,7 +71,7 @@ void OpHandleBase::Run(bool use_cuda) {
"The place of input(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
}
......
...@@ -209,9 +209,9 @@ struct OpInfoFiller<T, kShapeInference> {
template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc) {
T infer;
return infer(op_desc);
};
}
};
......
...@@ -17,8 +17,8 @@
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
...@@ -32,55 +32,22 @@ namespace framework {
then Out will inplaced use X's memory. The base class will do
legality validation for both variables.
*/
class InplaceOpInference {
public:
virtual ~InplaceOpInference() {}
virtual std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const = 0;
};
class InplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const {
std::unordered_map<std::string, std::string> ret;
auto in_out_var_names_pair = this->Apply(op_desc, block);
for (auto& pair : in_out_var_names_pair) {
PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
string::Sprintf("op %s do not have input of %s!",
op_desc.Type(), pair.first));
PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
string::Sprintf("op %s do not have output of %s!",
op_desc.Type(), pair.second));
auto& in_name = op_desc.Input(pair.first).at(0);
auto& out_name = op_desc.Output(pair.second).at(0);
auto in = block->FindRecursiveOrCreateVar(in_name);
auto out = block->FindRecursiveOrCreateVar(out_name);
if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
}
return ret;
}
protected:
virtual std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const = 0;
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSize(out) <= details::NodeSize(in);
}
};
/*
Inplace In and Out for operator only have an Input and an Output.
For example, activation op.
*/
class SingleOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
PADDLE_ENFORCE(!op_desc.InputNames().empty(),
"Op inputs must not be empty");
PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
...@@ -95,10 +62,10 @@ class SingleOpInplaceInToOut : public InplaceInToOut {
Gradient op. Inplace output use it's Input.
For example, Input@Grad->Input reuse strategy.
*/
class GradOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
std::unordered_map<std::string, std::string> ret;
std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
op_desc.OutputNames().end());
......
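With the simplified interface above, a per-op inference now only maps input parameter slots to output parameter slots; the pass itself resolves the variable names and checks whether the reuse is legal. A hypothetical sketch (the class, op, and parameter names below are invented for illustration and assume the inplace_op_inference.h header from this patch is available):

// Hypothetical inference for an op whose output "Out" may reuse input "X".
class DemoReluInplaceInference : public paddle::framework::InplaceOpInference {
 public:
  std::unordered_map<std::string, std::string> operator()(
      const paddle::framework::OpDesc& op_desc) const override {
    // Returns parameter-name pairs, not variable names; legality checks
    // (same size, reusable nodes, no external dependency) happen in the pass.
    return {{"X", "Out"}};
  }
};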
...@@ -127,26 +127,20 @@ class MultiOutGradShapeInference : public framework::InferShapeBase {
}
};
class MultiOutInplaceInToOut : public framework::InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
};
}
};
class MultiOutGradInplaceInToOut : public framework::InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{
{framework::GradVarName("YOut"), framework::GradVarName("Y")},
{framework::GradVarName("Out"), framework::GradVarName("X")},
...@@ -171,118 +165,118 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle {
namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("single_op");
//   op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
//   op->SetOutput("Out", {"test2_out"});
//
//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_out");
//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 1ul);
//   auto it = in_to_outs.begin();
//   EXPECT_EQ(it->first, "test2_a");
//   EXPECT_EQ(it->second, "test2_out");
// }
//
// TEST(InferInplace, SingleGradOpInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("single_op_grad");
//   op->SetInput(GradVarName("Out"), {"test2_out"});
//   op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
//
//   prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("test2_out");
//   prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 1ul);
//   auto it = in_to_outs.begin();
//   EXPECT_EQ(it->first, "test2_out");
//   EXPECT_EQ(it->second, "test2_a");
// }
//
// TEST(InferInplace, MultiOutInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("multi_out_op");
//   op->SetInput("X", {"a0", "a1"});
//   op->SetInput("Y", {"b0"});
//   op->SetInput("Z", {"c0", "c1"});
//   op->SetOutput("Out", {"o0"});
//   op->SetOutput("YOut", {"y0"});
//   op->SetOutput("ZOut", {"z0"});
//
//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("o0");
//   prog.MutableBlock(0)->Var("y0");
//   prog.MutableBlock(0)->Var("z0");
//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//   EXPECT_EQ(in_to_outs.size(), 3ul);
//   std::unordered_map<std::string, std::string> expects = {
//       {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
//   };
//   EXPECT_TRUE(expects == in_to_outs);
// }
//
// TEST(InferInplace, MultiGradInplaceInToOut) {
//   ProgramDesc prog;
//   auto* op = prog.MutableBlock(0)->AppendOp();
//   op->SetType("multi_out_grad");
//   op->SetInput(GradVarName("Out"), {"o0"});
//   op->SetInput(GradVarName("YOut"), {"y0"});
//   op->SetInput(GradVarName("ZOut"), {"z0"});
//   op->SetOutput(GradVarName("X"), {"a0", "a1"});
//   op->SetOutput(GradVarName("Y"), {"b0"});
//   op->SetOutput(GradVarName("Z"), {"c0", "c1"});
//
//   prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
//   prog.MutableBlock(0)->Var("o0");
//   prog.MutableBlock(0)->Var("y0");
//   prog.MutableBlock(0)->Var("z0");
//   prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
//   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
//   auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
//   auto in_to_outs = infer_inplace(*op);
//
//   EXPECT_EQ(in_to_outs.size(), 3ul);
//   std::unordered_map<std::string, std::string> expects = {
//       {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
//   };
//   EXPECT_TRUE(expects == in_to_outs);
// }
} // namespace framework
} // namespace paddle
...@@ -46,9 +46,6 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(cpu_quantize_placement_pass base)
pass_library(cpu_quantize_pass inference)
pass_library(cpu_quantize_squash_pass inference)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
...@@ -71,22 +68,31 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn)
pass_library(depthwise_conv_mkldnn_pass base mkldnn)
pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
pass_library(cpu_quantize_placement_pass base mkldnn)
pass_library(cpu_quantize_pass inference mkldnn)
pass_library(cpu_quantize_squash_pass inference mkldnn)
endif()
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
...@@ -105,9 +111,6 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
if(NOT WIN32)
cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
endif()
...@@ -117,4 +120,7 @@ if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
endif ()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(fill_constant); \
GET_IR_NODE(fill_constant_out); \
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
PADDLE_ENFORCE(subgraph.count(x));
auto* elementwise_in = subgraph.at(x);
float constant_value =
boost::get<float>(fill_constant->Op()->GetAttr("value"));
framework::OpDesc new_op_desc;
new_op_desc.SetType("scale");
new_op_desc.SetInput("X", {elementwise_in->Name()});
new_op_desc.SetAttr("scale", constant_value);
new_op_desc.SetAttr("bias", static_cast<float>(0.0));
new_op_desc.SetAttr("bias_after_scale", true);
new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto* scale_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(elementwise_in, scale_op); // Input
IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(),
{fill_constant, fill_constant_out, elementwise_mul});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
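As a reading aid (standard algebra, not text from the patch): the fuse above is value-preserving because multiplying an input elementwise by a tensor filled with a single constant c equals a scale op with scale = c and bias = 0,

y = x \odot \mathrm{fill\_constant}(c) = c \cdot x + 0.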
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -1470,6 +1470,171 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
return concat_out;
}
PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) {
// `times` is the number of repetitions of the per-branch subgraph
// {prior_box, box_out, reshape1, reshape1_out, box_var_out, reshape2,
//  reshape2_out}; each repetition contributes kNumFields nodes below.
const int kNumFields = 7;
const int kPriorBoxLocOffset = 1;
const int kReshape1Offset = 2;
const int kReshape1OutOffset = 3;
const int kPriorBoxVarOffset = 4;
const int kReshape2Offset = 5;
const int kReshape2OutOffset = 6;
const int kBoxCoderThirdInputOffset = times;
const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box"));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
->assert_is_op("box_coder")
->assert_op_has_n_inputs("box_coder", 3);
auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
->assert_is_op_output("box_coder")
->AsIntermediate();
auto transpose_before_nms =
pattern->NewNode(GetNodeName("transpose_before_nms"))
->assert_is_op("transpose2");
auto transpose_before_nms_out =
pattern->NewNode(GetNodeName("transpose_before_nms_out"))
->assert_is_op_output("transpose2")
->assert_is_op_input("multiclass_nms", "Scores")
->AsIntermediate();
auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
->assert_is_op("multiclass_nms")
->assert_op_has_n_inputs("multiclass_nms", 2);
auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
->assert_is_op_output("multiclass_nms")
->AsOutput();
std::vector<PDNode *> reshape1_outs;
std::vector<PDNode *> reshape2_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// prior_box
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// prior_box box out
nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxLocOffset]});
// reshape_out
nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape1Offset]});
nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxVarOffset]});
// reshape_out
nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape2Offset]});
reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
}
concat_op1->LinksFrom(reshape1_outs);
concat_op2->LinksFrom(reshape2_outs);
concat_out1->LinksFrom({concat_op1});
concat_out2->LinksFrom({concat_op2});
conv_in[kBoxCoderThirdInputOffset]->AsInput();
conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
box_coder_op->LinksFrom(
{concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
box_coder_out->LinksFrom({box_coder_op});
transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
transpose_before_nms_out->LinksFrom({transpose_before_nms});
multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
.LinksTo({multiclass_nms_out});
return multiclass_nms_out;
}
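// Summary of the subgraph matched above (descriptive only): for each
// i in [0, times), conv_in[i] feeds a density_prior_box whose Boxes and
// Variances outputs each pass through a reshape2 and feed concat1 / concat2
// respectively; concat1, concat2 and conv_in[times] feed box_coder;
// conv_in[times + 1] passes through transpose2 into multiclass_nms (Scores)
// together with the box_coder output, and multiclass_nms_out is returned as
// the pattern's output node.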
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) {
auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
->assert_is_op_output("fill_constant")
->assert_is_op_input("elementwise_mul", "Y")
->AsIntermediate();
auto elementwise_mul_op =
pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
->assert_is_op_output("elementwise_mul")
->AsOutput();
fill_constant_out->LinksFrom({fill_constant});
elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
elementwise_mul_out->LinksFrom({elementwise_mul_op});
return elementwise_mul_out;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase { ...@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase {
} }
}; };
struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {}
PDNode* operator()(PDNode* elementwise_op_input);
// declare operator node's name
PATTERN_DECL_NODE(fill_constant);
PATTERN_DECL_NODE(fill_constant_out);
PATTERN_DECL_NODE(elementwise_mul);
PATTERN_DECL_NODE(elementwise_mul_out);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <boost/logic/tribool.hpp> #include <boost/logic/tribool.hpp>
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name =
"simplify_anakin_detection_pattern_pass" + std::to_string(times);
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("density_prior_box", "Input")
->AsInput());
}
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(times))
->assert_is_op_input("box_coder", "TargetBox")
->AsInput());
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(times + 1))
->assert_is_op_input("transpose2")
->AsInput());
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
const int kNumFields = 7;
const int kPriorBoxLocOffset = 1;
const int kReshape1Offset = 2;
const int kReshape1OutOffset = 3;
const int kPriorBoxVarOffset = 4;
const int kReshape2Offset = 5;
const int kReshape2OutOffset = 6;
std::vector<Node *> nodes;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
}
Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1"));
Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out"));
Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2"));
Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out"));
Node *box_coder_third_input = subgraph.at(input_nodes[times]);
Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder"));
Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
Node *transpose_before_nms =
subgraph.at(pattern.GetPDNode("transpose_before_nms"));
Node *transpose_before_nms_out =
subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
Node *multiclass_nms_out =
subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
std::string code_type =
boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
bool box_normalized =
boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
// auto variance =
// boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
int background_label =
boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
float score_threshold =
boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
float nms_threshold =
boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
int keep_top_k =
boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
std::vector<std::string> concat1_input_names;
for (int i = 0; i < times; i++) {
concat1_input_names.push_back(
nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
}
// int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
framework::OpDesc concat1_desc;
concat1_desc.SetType("concat");
concat1_desc.SetInput("X", concat1_input_names);
concat1_desc.SetAttr("axis", 2);
concat1_desc.SetOutput("Out", {concat_out1->Name()});
auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
new_add_concat_op);
new_add_concat_op->inputs.push_back(
nodes[i * kNumFields + kPriorBoxLocOffset]);
}
framework::OpDesc new_op_desc;
new_op_desc.SetType("detection_out");
new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
new_op_desc.SetAttr("code_type", code_type);
new_op_desc.SetAttr("box_normalized", box_normalized);
new_op_desc.SetAttr("background_label", background_label);
new_op_desc.SetAttr("score_threshold", score_threshold);
new_op_desc.SetAttr("nms_top_k", nms_top_k);
new_op_desc.SetAttr("nms_threshold", nms_threshold);
new_op_desc.SetAttr("nms_eta", nms_eta);
new_op_desc.SetAttr("keep_top_k", keep_top_k);
new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
std::unordered_set<const Node *> delete_nodes;
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
}
delete_nodes.insert(concat_op1);
delete_nodes.insert(concat_op2);
delete_nodes.insert(concat_out2);
delete_nodes.insert(box_coder_op);
delete_nodes.insert(box_coder_out);
delete_nodes.insert(transpose_before_nms);
delete_nodes.insert(transpose_before_nms_out);
delete_nodes.insert(multiclass_nms);
new_add_concat_op->outputs.push_back(concat_out1);
concat_out1->inputs.push_back(new_add_concat_op);
detection_out_op->inputs.push_back(concat_out1);
detection_out_op->inputs.push_back(box_coder_third_input);
detection_out_op->inputs.push_back(multiclass_nms_second_input);
detection_out_op->outputs.push_back(multiclass_nms_out);
concat_out1->outputs.push_back(detection_out_op);
box_coder_third_input->outputs.push_back(detection_out_op);
multiclass_nms_second_input->outputs.push_back(detection_out_op);
multiclass_nms_out->inputs.push_back(detection_out_op);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), delete_nodes);
};
gpd(graph.get(), handler);
return graph;
}
template class SimplifyAnakinDetectionPatternPass<1>;
template class SimplifyAnakinDetectionPatternPass<2>;
template class SimplifyAnakinDetectionPatternPass<3>;
template class SimplifyAnakinDetectionPatternPass<4>;
template class SimplifyAnakinDetectionPatternPass<5>;
template class SimplifyAnakinDetectionPatternPass<6>;
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(simplify_anakin_detection_pattern_pass,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
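// Hedged usage note (not part of this commit): because the repeat count is a
// template parameter, each supported count is registered under its own pass
// name (simplify_anakin_detection_pattern_pass for times == 1, then
// ..._pass2 ... ..._pass6). A caller is expected to pick the name matching
// its model, e.g. (assuming the PassRegistry / Pass::Apply signatures that
// the ApplyImpl above is written against):
//
//   std::string name = "simplify_anakin_detection_pattern_pass";
//   if (times > 1) name += std::to_string(times);
//   auto pass = framework::ir::PassRegistry::Instance().Get(name);
//   graph = pass->Apply(std::move(graph));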
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// A detection model may contain several branches of
// {density_prior_box -> reshape2 -> concat} followed by box_coder,
// transpose2 and multiclass_nms. This pass matches that structure and fuses
// it into a single detection_out op. The template parameter `times` is the
// repeat count of the per-branch structure.
template <int times>
class SimplifyAnakinDetectionPatternPass : public FusePassBase {
public:
virtual ~SimplifyAnakinDetectionPatternPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <memory>
#include <string> #include <string>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
...@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl( ...@@ -123,6 +125,7 @@ std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
} }
template class TransposeFlattenConcatFusePass<1>; template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<2>;
template class TransposeFlattenConcatFusePass<3>; template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>; template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>; template class TransposeFlattenConcatFusePass<5>;
...@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>; ...@@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>;
REGISTER_PASS(transpose_flatten_concat_fuse_pass, REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>); paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass, REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>); paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
......
...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) { // if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1}); // return DDim({-1});
} // }
return tensor.dims(); return tensor.dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) { if (get_actual_dim) {
...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) { // if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod; // return default_lod;
} // }
return tensor.lod(); return tensor.lod();
} else { } else {
return default_lod; return default_lod;
......
...@@ -60,7 +60,7 @@ using InferVarTypeFN = ...@@ -60,7 +60,7 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>; using InferShapeFN = std::function<void(InferShapeContext*)>;
using InplacePair = std::unordered_map<std::string, std::string>; using InplacePair = std::unordered_map<std::string, std::string>;
using InferInplaceOpFN = std::function<InplacePair(const OpDesc&, BlockDesc*)>; using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>( using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/, const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
......
...@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i]->var_; framework::Variable* grad = outputs[i]->var_;
framework::Variable* orig_grad = origin_outputs[i]->var_; framework::Variable* orig_grad = origin_outputs[i]->var_;
VLOG(3) << "AddTo Called with orig_grad is: "
<< origin_outputs[i]->name_ << " Grad to be added is "
<< outputs[i]->name_;
AddTo(grad, orig_grad, place_); AddTo(grad, orig_grad, place_);
delete grad; delete grad;
} }
......
...@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VarBase* var = current_vars_map[var_it->second]; VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext()); InitGrad(var, prepared_op.GetDeviceContext());
grad_out_vars.push_back(var->grads_); grad_out_vars.push_back(var->grads_);
VLOG(3) << "grads output var name: " << var->name_;
} }
} }
} }
......
...@@ -16,7 +16,10 @@ add_subdirectory(utils) ...@@ -16,7 +16,10 @@ add_subdirectory(utils)
if (TENSORRT_FOUND) if (TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
# add_subdirectory(anakin)
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
......
cc_library(anakin_engine SRCS engine.cc) cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
target_link_libraries(anakin_engine anakin anakin_saber_common) target_link_libraries(anakin_engine anakin anakin_saber_common)
cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
add_subdirectory(convert) add_subdirectory(convert)
cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope) cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/activation.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
: op_type_(op_type) {
auto it = anakin_op_types_.find(op_type_);
PADDLE_ENFORCE(it != anakin_op_types_.end(),
"activation op type is not support");
anakin_op_type_ = it->second;
}
void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "Activation", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "type", anakin_op_type_);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter);
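// Standalone illustration (hypothetical, mirrors the constructor above): only
// op types present in anakin_op_types_ ("tanh" -> "TanH", "sigmoid" ->
// "Sigmoid") can be converted; any other type fails the PADDLE_ENFORCE.
#include <iostream>
#include <map>
#include <string>
int main() {
  const std::map<std::string, std::string> anakin_op_types{
      {"tanh", "TanH"}, {"sigmoid", "Sigmoid"}};
  for (const std::string op : {"tanh", "sigmoid", "relu"}) {
    auto it = anakin_op_types.find(op);
    if (it == anakin_op_types.end()) {
      std::cout << op << ": activation op type is not supported\n";
    } else {
      std::cout << op << " -> " << it->second << "\n";
    }
  }
  return 0;
}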
...@@ -14,45 +14,39 @@ ...@@ -14,45 +14,39 @@
#pragma once #pragma once
#include <functional>
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <utility> #include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
class AnakinOpConverter; class ActivationOpConverter : public AnakinOpConverter {
class OpRegister {
public: public:
OpRegister() = default; explicit ActivationOpConverter(const std::string &op_type);
std::shared_ptr<AnakinOpConverter> Get(const std::string &name);
static OpRegister *instance(); virtual void operator()(const framework::proto::OpDesc &op,
void OpRegisterFn(const std::string &name, const framework::Scope &scope,
std::function<std::shared_ptr<AnakinOpConverter>()> fn) { bool test_mode) override;
registry_[name] = fn; virtual ~ActivationOpConverter() {}
}
private: private:
using RegisterFnType = std::function<std::shared_ptr<AnakinOpConverter>()>; std::string op_type_;
std::map<std::string, std::function<std::shared_ptr<AnakinOpConverter>()>> std::string anakin_op_type_;
registry_; std::map<std::string, std::string> anakin_op_types_{{"tanh", "TanH"},
{"sigmoid", "Sigmoid"}};
}; };
template <typename T, typename... Args> class TanhOpConverter : public ActivationOpConverter {
class Registrar {
public: public:
Registrar(const std::string &name, Args... args) { TanhOpConverter() : ActivationOpConverter("tanh") {}
std::shared_ptr<AnakinOpConverter> converter =
std::make_shared<T>(std::move(args)...);
OpRegister::instance()->OpRegisterFn(name,
[converter]() { return converter; });
}
}; };
class SigmoidOpConverter : public ActivationOpConverter {
public:
SigmoidOpConverter() : ActivationOpConverter("sigmoid") {}
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/batch_norm.h"
#include <math.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
std::map<std::string, std::string> inputs;
for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) {
PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL);
auto v = op_desc.Input(k).front();
inputs.insert({k, v});
}
auto output = op_desc.Output("Y").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
// auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
auto bn_op_name = op_name + ":bn";
auto bn_output = bn_op_name + "_output";
engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
auto scale_op_name = op_name + ":scale";
auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
framework::LoDTensor *tensor) {
auto *v = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(v);
auto *t = v->GetMutable<framework::LoDTensor>();
tensor->Resize(t->dims());
TensorCopySync(*t, platform::CPUPlace(), tensor);
};
framework::LoDTensor bias_t;
framework::LoDTensor mean_t;
framework::LoDTensor scale_t;
framework::LoDTensor variance_t;
get_lod_tensor(inputs["Bias"], &bias_t);
get_lod_tensor(inputs["Mean"], &mean_t);
get_lod_tensor(inputs["Scale"], &scale_t);
get_lod_tensor(inputs["Variance"], &variance_t);
auto fill_shape = [](size_t n, std::vector<int> shape) {
shape.insert(shape.begin(), 1);
if (shape.size() < n) {
shape.insert(shape.end(), n - shape.size(), 1);
}
return shape;
};
Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
auto *weight2 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
auto *variance_data =
static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
Shape shape3(std::vector<int>({1, 1, 1, 1}));
auto *weight3 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
float weight3_data[] = {1};
std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
auto *scale =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
auto *bias =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
engine_->AddOpAttr(scale_op_name, "axis", 1);
engine_->AddOpAttr(scale_op_name, "num_axes", 1);
engine_->AddOpAttr(scale_op_name, "bias_term", true);
engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter);
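// Numeric sketch (standalone, illustrative only): the BatchNorm + Scale
// wiring above computes Paddle's inference-time batch_norm formula
//   y = scale * (x - mean) / sqrt(variance + epsilon) + bias
// in two stages, with weight_3 == 1 so the first stage only normalizes.
#include <cassert>
#include <cmath>
int main() {
  const float x = 3.f, mean = 1.f, variance = 4.f, epsilon = 1e-5f;
  const float scale = 0.5f, bias = 0.25f;
  // Stage 1: Anakin BatchNorm with weight_1 = mean, weight_2 = variance,
  // weight_3 = 1 (pure normalization).
  const float bn = (x - mean) / std::sqrt(variance + epsilon);
  // Stage 2: Anakin Scale with weight_1 = scale, weight_2 = bias.
  const float y = scale * bn + bias;
  // Reference single-formula result.
  const float ref = scale * (x - mean) / std::sqrt(variance + epsilon) + bias;
  assert(std::fabs(y - ref) < 1e-6f);
  return 0;
}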
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class BatchNormOpConverter : public AnakinOpConverter {
public:
BatchNormOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~BatchNormOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include <algorithm>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
int axis = boost::get<int>(op_desc.GetAttr("axis"));
auto input_names = op_desc.Input("X");
// PADDLE_ENFORCE(axis > 0,
// "The axis attr of Concat op should be large than 0 for trt");
auto y_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Concat", input_names, {y_name});
engine_->AddOpAttr(op_name, "axis", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ConcatOpConverter : public AnakinOpConverter {
public:
ConcatOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ConcatOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/conv2d.h"
#include <algorithm>
#include <memory>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter);
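// Worked example (hypothetical shapes, descriptive only): for a Paddle filter
// of dims [64, 3, 3, 3] with strides {2, 2}, paddings {1, 1}, dilations
// {1, 1} and groups 1, the attrs written above become:
//   filter_num = 64, kernel_size = {3, 3}, strides = {2, 2},
//   padding = {1, 1}, dilation_rate = {1, 1}, group = 1,
//   axis = 1, bias_term = false, weight_1 = the copied filter data.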
...@@ -12,22 +12,23 @@ ...@@ -12,22 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/anakin/convert/registrar.h" #pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
std::shared_ptr<AnakinOpConverter> OpRegister::Get(const std::string &name) { class Conv2dOpConverter : public AnakinOpConverter {
auto it = registry_.find(name); public:
if (it == registry_.end()) return nullptr; Conv2dOpConverter() = default;
return it->second();
}
OpRegister *OpRegister::instance() { virtual void operator()(const framework::proto::OpDesc &op,
static OpRegister factory; const framework::Scope &scope,
return &factory; bool test_mode) override;
} virtual ~Conv2dOpConverter() {}
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h"
#include <algorithm>
#include <memory>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL);
auto input_name = op_desc.Input("Input").front();
auto output_name = op_desc.Output("Output").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front();
engine_->AddOp(op_name, "Convolution", {input_name}, {output_name});
auto *filter_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(filter_v);
auto *filter_t = filter_v->GetMutable<framework::LoDTensor>();
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(filter_t->dims());
TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
// const int n_output = weight_tensor->dims()[0];
// const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
// auto filter_num = n_input * filter_h * filter_w ;
auto filter_num = weight_tensor->dims()[0];
engine_->AddOpAttr<int>(op_name, "filter_num", filter_num);
engine_->AddOpAttr<PTuple<int>>(op_name, "kernel_size", {filter_h, filter_w});
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
auto dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
engine_->AddOpAttr<PTuple<int>>(op_name, "dilation_rate", dilations);
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
engine_->AddOpAttr(op_name, "group", groups);
engine_->AddOpAttr(op_name, "axis", 1);
engine_->AddOpAttr(op_name, "bias_term", true);
auto weight_shape = framework::vectorize2int(filter_t->dims());
Shape anakin_shape(weight_shape);
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor->data<float>(), weight_tensor->numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Conv2dFusionOpConverter : public AnakinOpConverter {
public:
Conv2dFusionOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Conv2dFusionOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
#include <algorithm>
#include <map>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_name = op_desc.Input("Input").front();
auto image_name = op_desc.Input("Image").front();
auto output_name = op_desc.Output("Boxes").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
auto fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
auto fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
std::vector<float> dens;
for (auto& ele : densities) {
dens.push_back(static_cast<float>(ele));
}
// flip is not provided by the op; is_flip is set to false below.
// auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
for (auto& ele : variances) {
LOG(INFO) << ele;
}
// img_h and img_w are not provided by the op; both are set to 0 below.
auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
auto offset = boost::get<float>(op_desc.GetAttr("offset"));
PTuple<std::string> t_order;
t_order.push_back("MIN");
t_order.push_back("COM");
t_order.push_back("MAX");
std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
engine_->AddOpAttr(op_name, "step_h", step_h);
engine_->AddOpAttr(op_name, "step_w", step_w);
engine_->AddOpAttr(op_name, "offset", offset);
engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DensityPriorBoxOpConverter : public AnakinOpConverter {
public:
DensityPriorBoxOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DensityPriorBoxOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/detection_out.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto target_name = op_desc.Input("TargetBox").front();
auto prior_box_name = op_desc.Input("PriorBox").front();
auto scores_name = op_desc.Input("Scores").front();
auto output_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
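  // Map fluid's code_type string onto Anakin's naming; only decode_center_size
  // is supported here.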
std::string anakin_code_type;
if (code_type == "decode_center_size") {
anakin_code_type = "CENTER_SIZE";
} else if (code_type == "encode_center_size") {
    PADDLE_THROW(
        "The encode_center_size code_type is not supported in Anakin's DetectionOut");
}
engine_->AddOp(op_name, "DetectionOutput",
{target_name, scores_name, prior_box_name}, {output_name});
engine_->AddOpAttr(op_name, "share_location", true);
engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
engine_->AddOpAttr(op_name, "background_id", background_label);
engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DetectionOutOpConverter : public AnakinOpConverter {
public:
DetectionOutOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DetectionOutOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/dropout.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
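  // At inference time dropout only rescales its input, so it is lowered to
  // Anakin's Scale op with a constant factor of (1 - dropout_prob); the Mask
  // output is ignored.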
engine_->AddOp(op_name, "Scale", {x_name}, {out_name});
auto dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
auto factor = 1 - dropout_prob;
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *factor_data = static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {factor};
std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
engine_->AddOpAttr(op_name, "axis", 0);
engine_->AddOpAttr(op_name, "num_axes", 0);
engine_->AddOpAttr(op_name, "bias_term", false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class DropoutOpConverter : public AnakinOpConverter {
public:
DropoutOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~DropoutOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/elementwise.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Input("Y").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
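  // elementwise_add maps onto Anakin's Eltwise op in "Add" mode with unit
  // coefficients for both inputs.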
engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name});
std::string elementwise_type = "Add";
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
std::vector<float> coeff = {1.0, 1.0};
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
}
void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Input("Y").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
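  // elementwise_mul is lowered to Anakin's Scale op with Y passed as the
  // runtime scale input.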
engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name});
  // weight_1 is only a placeholder filled with 1; the real scale factors come
  // from the Y input at runtime.
Shape shape1(std::vector<int>({1, 1, 1, 1}));
auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
auto *placeholder_data =
static_cast<float *>(weight1->h_tensor().mutable_data());
float weight1_data[] = {1};
std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data);
engine_->AddOpAttr(op_name, "weight_1", *weight1);
auto axis = boost::get<int>(op_desc.GetAttr("axis"));
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr(op_name, "num_axes", 1);
engine_->AddOpAttr(op_name, "bias_term", false);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ElementwiseAddOpConverter : public AnakinOpConverter {
public:
ElementwiseAddOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ElementwiseAddOpConverter() {}
private:
};
class ElementwiseMulOpConverter : public AnakinOpConverter {
public:
ElementwiseMulOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ElementwiseMulOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,60 +14,108 @@ ...@@ -14,60 +14,108 @@
#include "paddle/fluid/inference/anakin/convert/fc.h" #include "paddle/fluid/inference/anakin/convert/fc.h"
#include <algorithm> #include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem; using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT; using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV; using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape; using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
void FcOpConverter::operator()(const framework::proto::OpDesc &op, void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, bool test_mode) { const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); auto input_names = op_desc.InputNames();
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); bool with_bias = input_names.size() == 3;
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
std::string w_name = "Y";
std::string i_name = "X";
if (with_bias) {
w_name = "W";
i_name = "Input";
}
auto x_name = op_desc.Input("X").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto *y_v = scope.FindVar(op_desc.Input("Y").front());
// get weights
auto *y_v = scope.FindVar(op_desc.Input(w_name).front());
PADDLE_ENFORCE_NOT_NULL(y_v); PADDLE_ENFORCE_NOT_NULL(y_v);
auto *y_t = y_v->GetMutable<framework::LoDTensor>(); auto *y_t = y_v->GetMutable<framework::LoDTensor>();
auto input_name = op_desc.Input("X").front(); auto input_name = op_desc.Input(i_name).front();
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
auto weight_shape = framework::vectorize2int(y_t->dims());
engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); engine_->AddOp(op_name, "Dense", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "bias_term", false); engine_->AddOpAttr(op_name, "bias_term", with_bias);
engine_->AddOpAttr(op_name, "axis", 1); engine_->AddOpAttr(op_name, "axis", 1);
auto weight_shape = framework::vectorize2int(y_t->dims());
int out_dim = weight_shape[1]; int out_dim = weight_shape[1];
engine_->AddOpAttr(op_name, "out_dim", out_dim); engine_->AddOpAttr(op_name, "out_dim", out_dim);
const int w_m = weight_shape[0];
const int w_k = weight_shape[1];
weight_shape.push_back(1); if (weight_shape.size() < 4UL) {
weight_shape.push_back(1); weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1);
}
Shape anakin_shape(weight_shape); Shape anakin_shape(weight_shape);
framework::LoDTensor weight_tensor; framework::LoDTensor weight_tensor;
weight_tensor.Resize(y_t->dims()); weight_tensor.Resize(y_t->dims());
TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor);
auto *weight_data = weight_tensor.data<float>();
PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel());
std::vector<float> trans_weight_data(weight_tensor.numel());
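  // Transpose the (w_m x w_k) row-major fluid weight into (w_k x w_m) order
  // before handing it to Anakin's Dense op.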
for (int i = 0; i < w_m; i++) {
for (int j = 0; j < w_k; j++) {
trans_weight_data[i + j * w_m] = weight_data[i * w_k + j];
}
}
auto *weight1 = auto *weight1 =
GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape); GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(anakin_shape);
float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data()); float *cpu_data = static_cast<float *>(weight1->h_tensor().mutable_data());
std::copy_n(weight_tensor.data<float>(), weight_tensor.numel(), cpu_data); std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data);
weight1->d_tensor().set_shape(anakin_shape); weight1->d_tensor().set_shape(anakin_shape);
weight1->d_tensor().copy_from(weight1->h_tensor()); weight1->d_tensor().copy_from(weight1->h_tensor());
engine_->AddOpAttr(op_name, "weight_1", *weight1); engine_->AddOpAttr(op_name, "weight_1", *weight1);
// get bias
if (with_bias) {
auto *b_v = scope.FindVar(op_desc.Input("Bias").front());
PADDLE_ENFORCE_NOT_NULL(b_v);
auto *b_t = b_v->GetMutable<framework::LoDTensor>();
auto bias_shape = framework::vectorize2int(b_t->dims());
framework::LoDTensor bias_tensor;
bias_tensor.Resize(b_t->dims());
TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor);
auto *bias_data = bias_tensor.data<float>();
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
bias_shape.insert(bias_shape.begin(), 1);
// bias_shape.push_back(1);
// bias_shape.push_back(1);
Shape anakin_bias_shape(bias_shape);
auto *weight2 = GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(
anakin_bias_shape);
float *cpu_data2 = static_cast<float *>(weight2->h_tensor().mutable_data());
std::copy_n(bias_data, bias_tensor.numel(), cpu_data2);
weight2->d_tensor().set_shape(anakin_bias_shape);
weight2->d_tensor().copy_from(weight2->h_tensor());
engine_->AddOpAttr(op_name, "weight_2", *weight2);
}
} }
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter);
...@@ -20,19 +20,28 @@ namespace paddle { ...@@ -20,19 +20,28 @@ namespace paddle {
namespace inference { namespace inference {
namespace anakin { namespace anakin {
class FcOpConverter : public AnakinOpConverter { class FcBaseOpConverter : public AnakinOpConverter {
public: public:
FcOpConverter() = default; FcBaseOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op, virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, const framework::Scope &scope,
bool test_mode) override; bool test_mode) override;
virtual ~FcOpConverter() {} virtual ~FcBaseOpConverter() {}
};
private: // with bias
class FcOpConverter : public FcBaseOpConverter {
public:
FcOpConverter() = default;
};
// without bias
class MulOpConverter : public FcBaseOpConverter {
public:
MulOpConverter() = default;
}; };
static Registrar<FcOpConverter> register_fc_op_converter("fc");
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/flatten.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
int axis = boost::get<int>(op_desc.GetAttr("axis"));
  PADDLE_ENFORCE(axis == 1,
                 "the anakin flatten op converter currently only supports axis == 1.");
std::vector<int> out_dims = {0, -1, 1, 1};
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class FlattenOpConverter : public AnakinOpConverter {
public:
FlattenOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~FlattenOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name});
std::vector<int> dilations = {1, 1};
auto paddings = boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
auto strides = boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
auto kernels = boost::get<std::vector<int>>(op_desc.GetAttr("kernels"));
engine_->AddOpAttr<PTuple<int>>(op_name, "paddings", paddings);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "window_size", kernels);
engine_->AddOpAttr<PTuple<int>>(op_name, "dilations", dilations);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Im2SequenceConverter : public AnakinOpConverter {
public:
Im2SequenceConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Im2SequenceConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,15 +14,16 @@ ...@@ -14,15 +14,16 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector>
#include "framework/core/types.h" #include "framework/core/types.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/anakin/convert/registrar.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
...@@ -46,19 +47,14 @@ class AnakinOpConverter { ...@@ -46,19 +47,14 @@ class AnakinOpConverter {
bool test_mode = false) { bool test_mode = false) {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type(); std::string op_type = op_desc.Type();
std::shared_ptr<AnakinOpConverter> it{nullptr}; AnakinOpConverter *it = nullptr;
if (op_type == "mul") { if (op_type == "reshape2") op_type = "reshape";
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); if (op_type == "transpose2") op_type = "transpose";
std::string Y = op_desc.Input("Y")[0]; if (op_type == "flatten2") op_type = "flatten";
std::cout << Y << parameters.count(Y) << std::endl;
if (parameters.count(Y)) {
it = OpRegister::instance()->Get("fc");
}
}
if (!it) { if (!it) {
it = OpRegister::instance()->Get(op_type); it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type); PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
it->SetEngine(engine); it->SetEngine(engine);
...@@ -74,6 +70,63 @@ class AnakinOpConverter { ...@@ -74,6 +70,63 @@ class AnakinOpConverter {
ConvertOp(op, parameters, scope, engine); ConvertOp(op, parameters, scope, engine);
} }
} }
  // The scope here should be initialized with the parameter vars.
void ConvertBlockToAnakinEngine(
framework::BlockDesc *block_desc, framework::Scope *scope,
const std::vector<std::string> &inputs,
const std::unordered_set<std::string> &parameters,
const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
framework::proto::BlockDesc *block_proto = block_desc->Proto();
ConvertBlock(*block_proto, parameters, *scope, engine);
engine->Freeze();
    // The max batch size must be set through config->EnableAnakinEngine.
    int max_batch_size = engine->GetMaxBatchSize();
    PADDLE_ENFORCE(max_batch_size > 0,
                   "the max_batch_size set from config->EnableAnakinEngine "
                   "must be larger than 0");
// If the user does not specify this variable, we use the input shape from
// the block_desc.
auto max_input_shape = engine->GetMaxInputShape();
std::map<std::string, std::vector<int>> temp_max_input_shape;
for (auto &input : inputs) {
if (parameters.count(input)) continue;
std::vector<int> input_shape;
input_shape.resize(4);
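      // Anakin uses 4-D input shapes; dim 0 is overwritten with the engine's
      // max batch size.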
input_shape[0] = max_batch_size;
if (max_input_shape.count(input)) {
PADDLE_ENFORCE(max_input_shape[input].size() == 4,
"the dimensions of max_input_shape setted from "
"config->EnableAnakinEngine must be 4");
for (int i = 1; i < 4; i++) {
input_shape[i] = max_input_shape[input][i];
}
} else {
auto *var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
auto var_shape = var->GetShape();
std::cout << "input :" << input << std::endl;
PADDLE_ENFORCE(var_shape.size() == 4);
for (size_t i = 1; i < var_shape.size(); i++) {
input_shape[i] = var_shape[i];
}
}
temp_max_input_shape[input] = input_shape;
engine->SetInputShape(input, input_shape);
      engine->Graph()->RegistVar(input);  // Register the var so it can share data with the fluid tensor.
}
engine->SetMaxInputShape(temp_max_input_shape);
engine->Optimize();
    // Allocate temporary memory so Anakin can share tensors with fluid.
engine->AllocTmpMem();
engine->InitGraph();
}
void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
virtual ~AnakinOpConverter() {} virtual ~AnakinOpConverter() {}
...@@ -91,22 +144,23 @@ class AnakinOpConverter { ...@@ -91,22 +144,23 @@ class AnakinOpConverter {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ #define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \
struct anakin_##op_type__##_converter \ struct anakin_##op_type__##_converter \
: public ::paddle::framework::Registrar { \ : public ::paddle::framework::Registrar { \
anakin_##op_type__##_converter() { \ anakin_##op_type__##_converter() { \
::paddle::inference:: \ LOG(INFO) << "register convert " << #op_type__; \
Registry<paddle::inference::anakin::AnakinOpConverter>::Register< \ ::paddle::inference::Registry< \
::paddle::inference::anakin::Converter__>(#op_type__); \ ::paddle::inference::anakin::AnakinOpConverter>::Global() \
} \ .Register<::paddle::inference::anakin::Converter__>(#op_type__); \
}; \ } \
anakin_##op_type__##_converter anakin_##op_type__##_converter__; \ }; \
int TouchConverterRegister_anakin_##op_type__() { \ anakin_##op_type__##_converter anakin_##op_type__##_converter__; \
anakin_##op_type__##_converter__.Touch(); \ int TouchConverterRegister_anakin_##op_type__() { \
return 0; \ anakin_##op_type__##_converter__.Touch(); \
return 0; \
} }
#define USE_ANAKIN_CONVERTER(op_type__) \ #define USE_ANAKIN_CONVERTER(op_type__) \
extern int TouchConverterRegister_anakin_##op_type__(); \ extern int TouchConverterRegister_anakin_##op_type__(); \
static int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \ int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_anakin_##op_type__(); TouchConverterRegister_anakin_##op_type__();
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/pool2d.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto x_name = op_desc.Input("X").front();
auto y_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
std::string pool_type =
boost::get<std::string>(op_desc.GetAttr("pooling_type"));
std::vector<int> ksize =
boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
std::string anakin_pool_type;
if (pool_type == "max") {
anakin_pool_type = "MAX";
} else if (pool_type == "avg") {
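    // With non-zero paddings, use Anakin's exclusive average pooling (AVGEXC)
    // so padded zeros are not counted in the average.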
if (paddings[0] || paddings[1]) {
anakin_pool_type = "AVGEXC";
} else {
anakin_pool_type = "AVG";
}
} else {
PADDLE_THROW("TensorRT unsupported pooling type!");
}
engine_->AddOp(op_name, "Pooling", {x_name}, {y_name});
engine_->AddOpAttr<PTuple<int>>(op_name, "pool_size", ksize);
engine_->AddOpAttr<PTuple<int>>(op_name, "strides", strides);
engine_->AddOpAttr<PTuple<int>>(op_name, "padding", paddings);
engine_->AddOpAttr(op_name, "method", anakin_pool_type);
engine_->AddOpAttr(op_name, "global_pooling", global_pooling);
engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class Pool2dOpConverter : public AnakinOpConverter {
public:
Pool2dOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~Pool2dOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/relu.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
engine_->AddOp(op_name, "ReLU", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "alpha", 0);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ReluOpConverter : public AnakinOpConverter {
public:
ReluOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ReluOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/reshape.h"
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Reshape", {input}, {output});
auto shape = boost::get<std::vector<int>>(op_desc.GetAttr("shape"));
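  // Pad the requested shape with trailing 1s, since the Anakin engine works
  // on 4-D shapes.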
if (shape.size() < 4) {
shape.insert(shape.end(), 4 - shape.size(), 1);
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", shape);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ReshapeOpConverter : public AnakinOpConverter {
public:
ReshapeOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ReshapeOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/scale.h"
#include <algorithm>
#include <map>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
auto input_name = op_desc.Input("X").front();
auto output_name = op_desc.Output("Out").front();
float scale = boost::get<float>(op_desc.GetAttr("scale"));
float bias = boost::get<float>(op_desc.GetAttr("bias"));
  bool bias_after_scale =
      boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
  PADDLE_ENFORCE(bias_after_scale,
                 "The anakin scale layer only supports bias after scale for now.");
engine_->AddOp(op_name, "Power", {input_name}, {output_name});
engine_->AddOpAttr(op_name, "shift", bias);
engine_->AddOpAttr(op_name, "scale", scale);
engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class ScaleOpConverter : public AnakinOpConverter {
public:
ScaleOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~ScaleOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/softmax.h"
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
namespace paddle {
namespace inference {
namespace anakin {
void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Softmax", {input}, {output});
engine_->AddOpAttr(op_name, "axis", 2);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SoftMaxOpConverter : public AnakinOpConverter {
public:
SoftMaxOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SoftMaxOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/split.h"
#include <algorithm>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
auto input_name = op_desc.Input("X").front();
auto y_names = op_desc.Output("Out");
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
int axis = boost::get<int>(op_desc.GetAttr("axis"));
std::vector<int> output_lengths =
boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
int split_num = output_lengths.size();
  PADDLE_ENFORCE(split_num > 1,
                 "anakin split op converter: the split num should be > 1");
int num_sum = 0;
std::vector<int> slice_point;
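  // Anakin's Slice op takes cumulative offsets along the axis, so turn the
  // section lengths into running sums (the last section is implied).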
for (int i = 0; i < split_num - 1; i++) {
num_sum += output_lengths[i];
slice_point.push_back(num_sum);
}
engine_->AddOp(op_name, "Slice", {input_name}, y_names);
engine_->AddOpAttr(op_name, "axis", axis);
engine_->AddOpAttr<PTuple<int>>(op_name, "slice_point", slice_point);
  // slice_dim is ignored by Anakin, so a fixed value is fine here.
engine_->AddOpAttr(op_name, "slice_dim", 4);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SplitOpConverter : public AnakinOpConverter {
public:
SplitOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SplitOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/sum.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void SumOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope, bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto input_names = op_desc.Input("X");
auto out_name = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
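  // sum of two tensors is expressed as Eltwise Add with coefficients {1, 1}.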
std::vector<float> coeff = {1, 1};
std::string elementwise_type = "Add";
engine_->AddOp(op_name, "Eltwise", input_names, {out_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
engine_->AddOpAttr<std::string>(op_name, "type", elementwise_type);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class SumOpConverter : public AnakinOpConverter {
public:
SumOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~SumOpConverter() {}
private:
};
} // namespace anakin
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/activation.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_activation_op(const std::string &op_type) {
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
TEST(sigm_op, test) { test_activation_op("sigmoid"); }
TEST(tanh_op, test) { test_activation_op("tanh"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sigmoid);
USE_OP(tanh);
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(batch_norm_op, test) {
std::unordered_set<std::string> parameters(
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});
validator.DeclParamVar("batch_norm_scale", param_shape);
validator.DeclParamVar("batch_norm_bias", param_shape);
validator.DeclParamVar("batch_norm_mean", param_shape);
validator.DeclParamVar("batch_norm_variance", param_shape);
validator.DeclOutputVar("batch_norm_Y", {1, 2, 5, 5});
validator.DeclOutputVar("batch_norm_save_mean", param_shape);
validator.DeclOutputVar("batch_norm_save_variance", param_shape);
// Prepare Op description
framework::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"batch_norm_X"});
desc.SetInput("Scale", {"batch_norm_scale"});
desc.SetInput("Bias", {"batch_norm_bias"});
desc.SetInput("Mean", {"batch_norm_mean"});
desc.SetInput("Variance", {"batch_norm_variance"});
desc.SetOutput("Y", {"batch_norm_Y"});
desc.SetOutput("MeanOut", {"batch_norm_mean"});
desc.SetOutput("VarianceOut", {"batch_norm_variance"});
desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
float eps = 1e-5f;
bool is_test = true;
desc.SetAttr("epsilon", eps);
desc.SetAttr("is_test", is_test);
validator.SetOp(*desc.Proto());
std::unordered_set<std::string> neglected_output = {
"batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
"batch_norm_variance"};
validator.Execute(1, neglected_output);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
USE_ANAKIN_CONVERTER(batch_norm);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/concat.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
validator.DeclOutputVar("concat_out", {1, 6, 1, 1});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(concat_op, test2) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("concat_x1", {1, 4});
validator.DeclInputVar("concat_x2", {3, 4});
validator.DeclInputVar("concat_x3", {2, 4});
validator.DeclOutputVar("concat_out", {6, 4});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 0;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(concat);
USE_ANAKIN_CONVERTER(concat);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/conv2d.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(conv2d_op, test) {
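// 1x1 convolution mapping 3 input channels to 4 output channels on a 3x3 feature map.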
auto* conv2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("conv2d");
ASSERT_TRUE(conv2d_converter != nullptr);
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d");
desc.SetInput("Input", {"conv2d-X"});
desc.SetInput("Filter", {"conv2d-Y"});
desc.SetOutput("Output", {"conv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({0, 0});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
USE_ANAKIN_CONVERTER(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/dropout.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(dropout_op, native) {
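// Run dropout in inference mode (is_test = true); the mask output is not compared.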
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
validator.DeclOutputVar("mask", {1, 1, 2, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("dropout");
desc.SetInput("X", {"x"});
desc.SetOutput("Out", {"out"});
desc.SetOutput("Mask", {"mask"});
float dropout_prob = 0.5;
desc.SetAttr("dropout_prob", dropout_prob);
desc.SetAttr("is_test", true);
validator.SetOp(*desc.Proto());
std::unordered_set<std::string> neglected_output = {"mask"};
validator.Execute(1, neglected_output);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(dropout);
USE_ANAKIN_CONVERTER(dropout);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/elementwise.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_elementwise_op(const std::string &op_type) {
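// Shared helper: both inputs have the same shape, so broadcasting with axis = -1 is trivial.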
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclInputVar("y", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1, 2, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"x"});
desc.SetInput("Y", {"y"});
desc.SetOutput("Out", {"out"});
int axis = -1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); }
TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_add);
USE_OP(elementwise_mul);
USE_ANAKIN_CONVERTER(elementwise_mul);
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/fc.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h" #include "paddle/fluid/inference/anakin/convert/ut_helper.h"
...@@ -22,17 +21,15 @@ namespace inference { ...@@ -22,17 +21,15 @@ namespace inference {
namespace anakin { namespace anakin {
TEST(fc_op, test) { TEST(fc_op, test) {
auto fc_converter = OpRegister::instance()->Get("fc"); auto* fc_converter = Registry<AnakinOpConverter>::Global().Lookup("fc");
ASSERT_TRUE(fc_converter != nullptr); ASSERT_TRUE(fc_converter);
// Registrar<FcOpConverter> register_fc("fc");
// auto fc = std::make_shared<FcOpConverter>();
std::unordered_set<std::string> parameters({"mul_y"}); std::unordered_set<std::string> parameters({"mul_y"});
framework::Scope scope; framework::Scope scope;
AnakinConvertValidation validator(parameters, scope); AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("mul_x", {1, 1, 1, 1}); validator.DeclInputVar("mul_x", {1, 1, 2, 2});
validator.DeclParamVar("mul_y", {1, 2}); validator.DeclParamVar("mul_y", {4, 2});
validator.DeclOutputVar("mul_out", {1, 1, 1, 2}); validator.DeclOutputVar("mul_out", {1, 2});
// Prepare Op description // Prepare Op description
framework::OpDesc desc; framework::OpDesc desc;
...@@ -40,8 +37,6 @@ TEST(fc_op, test) { ...@@ -40,8 +37,6 @@ TEST(fc_op, test) {
desc.SetInput("X", {"mul_x"}); desc.SetInput("X", {"mul_x"});
desc.SetInput("Y", {"mul_y"}); desc.SetInput("Y", {"mul_y"});
desc.SetOutput("Out", {"mul_out"}); desc.SetOutput("Out", {"mul_out"});
int num_flatten_dims = 3;
desc.SetAttr("x_num_col_dims", num_flatten_dims);
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
validator.Execute(10); validator.Execute(10);
...@@ -52,3 +47,4 @@ TEST(fc_op, test) { ...@@ -52,3 +47,4 @@ TEST(fc_op, test) {
} // namespace paddle } // namespace paddle
USE_OP(mul); USE_OP(mul);
USE_ANAKIN_CONVERTER(fc);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(flatten_op, test) {
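// Flatten from axis 1 onward: {3, 10, 10, 4} -> {3, 400}, declared as {3, 400, 1, 1} for the validator.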
auto *converter = Registry<AnakinOpConverter>::Global().Lookup("flatten");
ASSERT_TRUE(converter);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
framework::OpDesc desc;
desc.SetType("flatten");
desc.SetInput("X", {"flatten-X"});
desc.SetOutput("Out", {"flatten-Out"});
desc.SetAttr("axis", 1);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_OP_ITSELF(flatten);
USE_ANAKIN_CONVERTER(flatten);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/im2sequence.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(im2sequence_op, native) {
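// im2sequence with a 6x1 kernel, unit strides and zero padding.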
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
std::vector<int> kernels = {6, 1};
std::vector<int> strides = {1, 1};
std::vector<int> paddings = {0, 0, 0, 0};
validator.DeclInputVar("x", {1, 1, 2, 2});
validator.DeclOutputVar("out", {1, 1 * kernels[0] * kernels[1]});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("im2sequence");
desc.SetInput("X", {"x"});
desc.SetOutput("Out", {"out"});
desc.SetAttr("kernels", kernels);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(im2sequence);
USE_ANAKIN_CONVERTER(im2sequence);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
void test_pool2d(bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
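// 2x2 pooling with stride 2 and no padding; the expected output shape depends on global_pooling and ceil_mode.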
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// Declare the input and output with full NCHW shapes (batch size 1).
validator.DeclInputVar("pool2d_x", {1, 3, 6, 7});
if (global_pooling)
validator.DeclOutputVar("pool2d_out", {1, 3, 1, 1});
else if (ceil_mode)
validator.DeclOutputVar("pool2d_out", {1, 3, 3, 4});
else
validator.DeclOutputVar("pool2d_out", {1, 3, 3, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d_x"});
desc.SetOutput("Out", {"pool2d_out"});
std::vector<int> ksize({2, 2});
std::vector<int> strides({2, 2});
std::vector<int> paddings({0, 0});
std::string pooling_t = pool_type;
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", ceil_mode);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
void test_pool2d2(bool global_pooling, bool ceil_mode,
std::string pool_type = "max") {
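// 3x3 pooling with stride 1 and padding 1; ceil_mode is forced on, so the 17x17 spatial size is preserved.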
auto* pool2d_converter =
Registry<AnakinOpConverter>::Global().Lookup("pool2d");
ASSERT_TRUE(pool2d_converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// Declare the input and output with full NCHW shapes (batch size 1).
validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("pool2d");
desc.SetInput("X", {"pool2d_x"});
desc.SetOutput("Out", {"pool2d_out"});
std::vector<int> ksize({3, 3});
std::vector<int> strides({1, 1});
std::vector<int> paddings({1, 1});
std::string pooling_t = pool_type;
desc.SetAttr("pooling_type", pooling_t);
desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
desc.SetAttr("ceil_mode", true);
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(pool2d);
USE_ANAKIN_CONVERTER(pool2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/relu.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
static void test_activation_op(const std::string &op_type) {
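// Build a single activation op on a {10, 6, 1, 1} input and validate the converted result against Fluid.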
auto *converter = Registry<AnakinOpConverter>::Global().Lookup(op_type);
PADDLE_ENFORCE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("act-X", {10, 6, 1, 1});
validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
framework::OpDesc desc;
desc.SetType(op_type);
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(5);
}
TEST(relu_op, test) { test_activation_op("relu"); }
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(relu);
USE_ANAKIN_CONVERTER(relu);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(reshape, test) {
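// Reshape {1, 2, 4, 1} into {1, 8, 1, 1} using an explicit target shape.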
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("reshape");
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
// validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
// validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
validator.DeclInputVar("reshape-X", {1, 2, 4, 1});
validator.DeclOutputVar("reshape-Out", {1, 8, 1, 1});
framework::OpDesc desc;
desc.SetType("reshape");
desc.SetInput("X", {"reshape-X"});
desc.SetOutput("Out", {"reshape-Out"});
// desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
desc.SetAttr("shape", std::vector<int>({1, 8, 1, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
TEST(reshape, test2) {
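// Use 0 (keep dim) and -1 (infer dim) in the target shape: {1, 2, 4} -> {1, 4, 2}.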
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("reshape-X", {1, 2, 4});
validator.DeclOutputVar("reshape-Out", {1, 4, 2});
framework::OpDesc desc;
desc.SetType("reshape");
desc.SetInput("X", {"reshape-X"});
desc.SetOutput("Out", {"reshape-Out"});
// desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
desc.SetAttr("shape", std::vector<int>({0, -1, 2}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(reshape);
USE_ANAKIN_CONVERTER(reshape);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(softmax, test) {
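// Validate the softmax converter on a {1, 10, 2} input.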
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("softmax");
ASSERT_TRUE(converter);
framework::Scope scope;
std::unordered_set<std::string> parameters;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("softmax-X", {1, 10, 2});
validator.DeclOutputVar("softmax-Out", {1, 10, 2});
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(softmax);
USE_ANAKIN_CONVERTER(softmax);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/split.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
template <int Axis>
void AnakinSliceTest(const std::vector<int> &in_shape,
const std::vector<int> &sections) {
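// Split the input along the Axis template parameter into one output per section and validate each.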
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("split_input", in_shape);
std::vector<std::string> output_vars;
for (size_t i = 0; i < sections.size(); ++i) {
auto out_shape = in_shape;
out_shape[Axis] = sections[i];
std::string output_name = "split_out" + std::to_string(i);
validator.DeclOutputVar(output_name, out_shape);
output_vars.push_back(output_name);
}
// Prepare Op description
framework::OpDesc desc;
desc.SetType("split");
desc.SetInput("X", {"split_input"});
desc.SetOutput("Out", output_vars);
desc.SetAttr("axis", Axis);
desc.SetAttr("num", 0);
desc.SetAttr("sections", sections);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
// batch = 0, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
}
// batch = 0, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch1) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
}
// batch = 10, axis = 1, same shape
TEST(split_op, test_same_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2});
}
// batch = 10, axis = 1, different shape
TEST(split_op, test_different_shape_axis1_batch10) {
AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1});
}
// batch = 0, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
}
// batch = 0, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch1) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
}
// batch = 10, axis = 2, same shape
TEST(split_op, test_same_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2});
}
// batch = 10, axis = 2, different shape
TEST(split_op, test_different_shape_axis2_batch10) {
AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1});
}
// batch = 0, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
}
// batch = 0, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch1) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
}
// batch = 10, axis = 3, same shape
TEST(split_op, test_same_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2});
}
// batch = 10, axis = 3, different shape
TEST(split_op, test_different_shape_axis3_batch10) {
AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1});
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(split);
USE_ANAKIN_CONVERTER(split);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/sum.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
#include "paddle/fluid/operators/sum_op.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(sum, native) {
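// Element-wise sum of two tensors with identical shapes.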
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
validator.DeclOutputVar("sum_out", {1, 2, 1, 2});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("sum");
desc.SetInput("X", {"sum_x1", "sum_x2"});
desc.SetOutput("Out", {"sum_out"});
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(sum);
USE_ANAKIN_CONVERTER(sum);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace anakin {
TEST(transpose_op, test) {
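// Permute a 4-D tensor with axis = {2, 0, 3, 1}: {2, 3, 4, 5} -> {4, 2, 5, 3}.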
auto* converter = Registry<AnakinOpConverter>::Global().Lookup("transpose");
ASSERT_TRUE(converter != nullptr);
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("transpose");
desc.SetInput("X", {"transpose-X"});
desc.SetOutput("Out", {"transpose-Out"});
desc.SetAttr("axis", std::vector<int>({2, 0, 3, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
// Test an input whose rank is less than 4.
TEST(transpose_op, test2) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
AnakinConvertValidation validator(parameters, &scope);
validator.DeclInputVar("transpose-X", {3, 4, 5});
validator.DeclOutputVar("transpose-Out", {3, 5, 4});
// Prepare Op description
framework::OpDesc desc;
desc.SetType("transpose");
desc.SetInput("X", {"transpose-X"});
desc.SetOutput("Out", {"transpose-Out"});
desc.SetAttr("axis", std::vector<int>({0, 2, 1}));
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(1);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
USE_OP(transpose);
USE_ANAKIN_CONVERTER(transpose);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/convert/transpose.h"
#include <algorithm>
#include <string>
#include <vector>
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::saber::NV;
using anakin::saber::Shape;
using anakin::PTuple;
namespace paddle {
namespace inference {
namespace anakin {
void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) {
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto input = op_desc.Input("X").front();
auto output = op_desc.Output("Out").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
engine_->AddOp(op_name, "Permute", {input}, {output});
auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
size_t axis_size = axis.size();
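// Pad the permutation to four dims (identity indices for the trailing axes), since the Anakin Permute attribute used here is 4-D.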
while (axis.size() < 4) {
axis.push_back(axis_size);
axis_size += 1;
}
engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
}
} // namespace anakin
} // namespace inference
} // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace anakin {
class TransposeOpConverter : public AnakinOpConverter {
public:
TransposeOpConverter() = default;
virtual void operator()(const framework::proto::OpDesc &op,
const framework::Scope &scope,
bool test_mode) override;
virtual ~TransposeOpConverter() {}
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <gtest/gtest.h>
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -24,6 +25,7 @@ limitations under the License. */ ...@@ -24,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -82,7 +84,7 @@ class AnakinConvertValidation { ...@@ -82,7 +84,7 @@ class AnakinConvertValidation {
AnakinConvertValidation() = delete; AnakinConvertValidation() = delete;
AnakinConvertValidation(const std::unordered_set<std::string>& parameters, AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
const framework::Scope& scope) framework::Scope* scope)
: parameters_(parameters), scope_(scope), place_(0) { : parameters_(parameters), scope_(scope), place_(0) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new AnakinEngine<NV, Precision::FP32>(true)); engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
...@@ -106,7 +108,7 @@ class AnakinConvertValidation { ...@@ -106,7 +108,7 @@ class AnakinConvertValidation {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_); platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name); auto* x = scope_->Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx); RandomizeTensor(x_tensor, place_, ctx);
...@@ -118,15 +120,22 @@ class AnakinConvertValidation { ...@@ -118,15 +120,22 @@ class AnakinConvertValidation {
// should init anakin engine here. // should init anakin engine here.
Singleton<AnakinOpConverter>::Global().ConvertOp( Singleton<AnakinOpConverter>::Global().ConvertOp(
desc, parameters_, scope_, engine_.get(), true /*test_mode*/); desc, parameters_, *scope_, engine_.get(), true /*test_mode*/);
engine_->Freeze(); engine_->Freeze();
std::map<std::string, std::vector<int>> temp_max_input_shape;
for (const auto& input : op_desc_->InputArgumentNames()) { for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue; if (parameters_.count(input)) continue;
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_, auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(*scope_,
input); input);
auto t_shape = framework::vectorize2int(t.dims()); auto t_shape = framework::vectorize2int(t.dims());
while (t_shape.size() < 4) {
t_shape.push_back(1);
}
engine_->SetInputShape(input, t_shape); engine_->SetInputShape(input, t_shape);
temp_max_input_shape[input] = t_shape;
} }
engine_->SetMaxInputShape(temp_max_input_shape);
engine_->Optimize(); engine_->Optimize();
engine_->InitGraph(); engine_->InitGraph();
} }
...@@ -138,14 +147,14 @@ class AnakinConvertValidation { ...@@ -138,14 +147,14 @@ class AnakinConvertValidation {
std::unordered_set<std::string> neglected_output = {}) { std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op // Execute Fluid Op
platform::CUDADeviceContext ctx(place_); platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_); op_->Run(*scope_, place_);
// std::vector<framework::LoDTensor> input_vector; // std::vector<framework::LoDTensor> input_vector;
// std::vector<framework::LoDTensor> output_vector; // std::vector<framework::LoDTensor> output_vector;
std::map<std::string, framework::LoDTensor*> inputs; std::map<std::string, framework::LoDTensor*> inputs;
for (const auto& input : op_desc_->InputArgumentNames()) { for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue; if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input); auto* var = scope_->FindVar(input);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
inputs.insert({input, tensor}); inputs.insert({input, tensor});
} }
...@@ -155,45 +164,38 @@ class AnakinConvertValidation { ...@@ -155,45 +164,38 @@ class AnakinConvertValidation {
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue; if (neglected_output.count(output)) continue;
std::vector<float> fluid_out; std::vector<float> fluid_out;
auto* var = scope_.FindVar(output); auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out); framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outputs.push_back(fluid_out); fluid_outputs.push_back(fluid_out);
// size_t fluid_out_size = fluid_out.size();
/*for (size_t i = 0; i < fluid_out_size; i++) {
std::cout << fluid_out[i] << std::endl;
}*/
outputs.insert({output, tensor}); outputs.insert({output, tensor});
} }
engine_->Execute(inputs, outputs); engine_->Execute(inputs, outputs, stream_);
int i_output = 0; int i_output = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue; if (neglected_output.count(output)) continue;
std::vector<float> anakin_out; std::vector<float> anakin_out;
auto* var = scope_.FindVar(output); auto* var = scope_->FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &anakin_out); framework::TensorToVector(*tensor, ctx, &anakin_out);
size_t anakin_out_size = anakin_out.size(); size_t anakin_out_size = anakin_out.size();
auto fluid_out = fluid_outputs[i_output++]; auto fluid_out = fluid_outputs[i_output++];
for (size_t i = 0; i < anakin_out_size; i++) { for (size_t i = 0; i < anakin_out_size; i++) {
LOG(INFO) << "Output[" << i << "]: anakin[" << anakin_out[i] << "], " EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3);
<< "fluid[" << fluid_out[i] << "]";
} }
} }
} }
framework::Scope& scope() { return scope_; }
private: private:
std::unique_ptr<AnakinNvEngineT> engine_{nullptr}; std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_; std::unique_ptr<framework::OpDesc> op_desc_;
const std::unordered_set<std::string>& parameters_; const std::unordered_set<std::string>& parameters_;
framework::Scope& scope_; framework::Scope* scope_;
platform::CUDAPlace place_; platform::CUDAPlace place_;
}; };
......
...@@ -33,9 +33,15 @@ namespace inference { ...@@ -33,9 +33,15 @@ namespace inference {
namespace anakin { namespace anakin {
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary) AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape)
: graph_(new AnakinGraphT<TargetT, PrecisionType>()), : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {} net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
device_ = device;
max_batch_size_ = max_batch_size;
max_input_shape_ = max_input_shape;
}
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {} AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}
...@@ -63,34 +69,53 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp( ...@@ -63,34 +69,53 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Execute( void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
const std::map<std::string, framework::LoDTensor *> &inputs, const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs) { const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream) {
cudaDeviceSynchronize();
for (const auto &input : inputs) { for (const auto &input : inputs) {
auto *tensor = input.second; auto *tensor = input.second;
auto *data = tensor->data<float>(); auto *data = tensor->data<float>();
auto shape = framework::vectorize2int(tensor->dims());
::anakin::saber::Shape anakin_shape(shape); auto fluid_input_shape = framework::vectorize2int(tensor->dims());
while (fluid_input_shape.size() < 4) {
fluid_input_shape.push_back(1);
}
auto *anakin_input = net_->get_in(input.first); auto *anakin_input = net_->get_in(input.first);
std::vector<int> max_input_shape = max_input_shape_[input.first];
int max_shape_sum =
std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1,
std::multiplies<int>());
PADDLE_ENFORCE(max_shape_sum >= tensor->numel(),
"The anakin input max shape should be greater than"
" or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine");
anakin_input->reshape(fluid_input_shape);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
anakin_shape); fluid_input_shape);
anakin_input->share_from(tmp_anakin_tensor); anakin_input->copy_from(tmp_anakin_tensor);
} }
net_->prediction();
cudaDeviceSynchronize();
for (const auto &output : outputs) { for (const auto &output : outputs) {
platform::CUDAPlace gpu_place(device_);
auto *tensor = output.second; auto *tensor = output.second;
auto *data = tensor->data<float>();
auto shape = framework::vectorize2int(tensor->dims());
::anakin::saber::Shape anakin_shape(shape);
auto *anakin_output = net_->get_out(output.first); auto *anakin_output = net_->get_out(output.first);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, auto *anakin_data = anakin_output->data();
anakin_shape); auto anakin_output_shape = anakin_output->valid_shape();
anakin_output->share_from(tmp_anakin_tensor); tensor->Resize(framework::make_ddim(anakin_output_shape));
auto *fluid_data = tensor->mutable_data<float>(gpu_place);
memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
static_cast<void *>(anakin_data),
tensor->numel() * sizeof(float), stream);
} }
net_->prediction(); cudaDeviceSynchronize();
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() { void AnakinEngine<TargetT, PrecisionType, RunType>::Freeze() {
PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph.");
} }
template <typename TargetT, Precision PrecisionType, OpRunType RunType> template <typename TargetT, Precision PrecisionType, OpRunType RunType>
......
...@@ -15,9 +15,11 @@ ...@@ -15,9 +15,11 @@
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/engine.h"
...@@ -26,8 +28,12 @@ ...@@ -26,8 +28,12 @@
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
#include "framework/core/types.h" #include "framework/core/types.h"
#include "framework/graph/graph.h" #include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "saber/saber_types.h" #include "saber/saber_types.h"
using anakin::Precision;
using anakin::saber::NV;
namespace anakin { namespace anakin {
template <typename, Precision, OpRunType> template <typename, Precision, OpRunType>
...@@ -46,8 +52,13 @@ namespace anakin { ...@@ -46,8 +52,13 @@ namespace anakin {
template <typename TargetT, ::anakin::Precision PrecisionType, template <typename TargetT, ::anakin::Precision PrecisionType,
::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC> ::anakin::OpRunType RunType = ::anakin::OpRunType::ASYNC>
class AnakinEngine { class AnakinEngine {
using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>;
using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
public: public:
explicit AnakinEngine(bool need_summary = false); explicit AnakinEngine(
bool need_summary = false, int device = 0, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {});
~AnakinEngine(); ~AnakinEngine();
void InitGraph(); void InitGraph();
void SetInputShape(const std::string &name, std::vector<int> shape); void SetInputShape(const std::string &name, std::vector<int> shape);
...@@ -61,20 +72,72 @@ class AnakinEngine { ...@@ -61,20 +72,72 @@ class AnakinEngine {
PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value), PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value),
"Add operation's attribution."); "Add operation's attribution.");
} }
NetT *Net() { return net_.get(); }
GraphT *Graph() { return graph_.get(); }
std::unique_ptr<AnakinEngine> Clone(); std::unique_ptr<AnakinEngine> Clone();
const std::map<std::string, std::vector<int>> &GetMaxInputShape() {
return max_input_shape_;
}
void SetMaxInputShape(std::map<std::string, std::vector<int>> shape) {
max_input_shape_ = shape;
}
int GetMaxBatchSize() { return max_batch_size_; }
void Freeze(); void Freeze();
void Optimize(); void Optimize();
void AllocTmpMem() {
PADDLE_ENFORCE(net_->alloc_memory_first(*graph_),
"anakin alloc temp memory first failed");
}
void Save(std::string path) { graph_->save(path); }
bool IsInit() { return initialized_; }
int GetDevice() { return device_; }
void Execute(const std::map<std::string, framework::LoDTensor *> &inputs, void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
const std::map<std::string, framework::LoDTensor *> &outputs); const std::map<std::string, framework::LoDTensor *> &outputs,
cudaStream_t stream);
private: private:
using NetT = ::anakin::Net<TargetT, PrecisionType, RunType>; bool initialized_{false};
using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>; int max_batch_size_;
std::map<std::string, std::vector<int>> max_input_shape_;
int device_;
std::unique_ptr<GraphT> graph_; std::unique_ptr<GraphT> graph_;
std::unique_ptr<NetT> net_; std::unique_ptr<NetT> net_;
}; };
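// Manages one AnakinEngine per engine name; Create() is guarded by a mutex.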
class AnakinEngineManager {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
public:
bool HasEngine(const std::string &name) const {
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
AnakinNvEngineT *Get(const std::string &name) const {
return engines_.at(name).get();
}
AnakinNvEngineT *Create(
bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape,
std::string engine_name) {
std::unique_lock<std::mutex> lk(mut_);
auto *p = new AnakinEngine<NV, Precision::FP32>(
need_summary, device, max_batch_size, max_input_shape);
engines_[engine_name].reset(p);
return p;
}
void DeleteALL() {
for (auto &item : engines_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
std::mutex mut_;
};
} // namespace anakin } // namespace anakin
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/anakin/op_teller.h"
namespace paddle {
namespace inference {
namespace anakin {
// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {
teller_set.insert("mul");
teller_set.insert("fc");
teller_set.insert("conv2d_fusion");
teller_set.insert("split");
teller_set.insert("relu");
teller_set.insert("pool2d");
teller_set.insert("elementwise_add");
teller_set.insert("elementwise_mul");
teller_set.insert("concat");
teller_set.insert("tanh");
teller_set.insert("conv2d");
teller_set.insert("batch_norm");
teller_set.insert("softmax");
teller_set.insert("flatten2");
teller_set.insert("reshape2");
teller_set.insert("transpose2");
teller_set.insert("density_prior_box");
teller_set.insert("detection_out");
teller_set.insert("dropout");
teller_set.insert("sigmoid");
teller_set.insert("sum");
}
bool operator()(const std::string& op_type,
const framework::OpDesc& desc) override {
return teller_set.count(op_type);
}
private:
std::unordered_set<std::string> teller_set;
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace anakin
} // namespace inference
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace anakin {
/*
* Single Op teller definition.
* One can override this to define a more complex tell logic that considers more
* issues, such as the op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to an Anakin layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace anakin
} // namespace inference
} // namespace paddle
...@@ -17,9 +17,6 @@ limitations under the License. */ ...@@ -17,9 +17,6 @@ limitations under the License. */
#include <map> #include <map>
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "framework/graph/graph_global_mem.h"
#include "paddle/fluid/inference/anakin/engine.h" #include "paddle/fluid/inference/anakin/engine.h"
using anakin::graph::GraphGlobalMem; using anakin::graph::GraphGlobalMem;
...@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) { ...@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) {
auto *y_data = y.mutable_data<float>(platform::CUDAPlace()); auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}}; std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
engine_->Execute(inputs, outputs); cudaStream_t stream;
engine_->Execute(inputs, outputs, stream);
auto *y_data_gpu = y_data; auto *y_data_gpu = y_data;
float y_data_cpu[2]; float y_data_cpu[2];
cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
...@@ -41,8 +42,11 @@ namespace inference { ...@@ -41,8 +42,11 @@ namespace inference {
namespace analysis { namespace analysis {
using framework::ir::Graph; using framework::ir::Graph;
#ifdef PADDLE_WITH_MKLDNN
using VarQuantScale = using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>; std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
#endif
/* /*
* The argument definition of both Pass and PassManagers. * The argument definition of both Pass and PassManagers.
...@@ -55,6 +59,8 @@ struct Argument { ...@@ -55,6 +59,8 @@ struct Argument {
using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>; using unique_ptr_t = std::unique_ptr<void, std::function<void(void*)>>;
using fusion_statis_t = std::unordered_map<std::string, int>; using fusion_statis_t = std::unordered_map<std::string, int>;
using engine_opt_info_t = std::map<std::string, std::string>;
using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
bool Has(const std::string& key) const { return valid_fields_.count(key); } bool Has(const std::string& key) const { return valid_fields_.count(key); }
...@@ -107,12 +113,14 @@ struct Argument { ...@@ -107,12 +113,14 @@ struct Argument {
private: \ private: \
unique_ptr_t field__##_; unique_ptr_t field__##_;
DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
// Model path // Model path
DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
// Model specified with program and parameters files. // Model specified with program and parameters files.
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t);
// The overall graph to work on. // The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
...@@ -132,6 +140,7 @@ struct Argument { ...@@ -132,6 +140,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
#ifdef PADDLE_WITH_MKLDNN
// A set of op types to enable their quantized kernels // A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
...@@ -142,6 +151,7 @@ struct Argument { ...@@ -142,6 +151,7 @@ struct Argument {
// Scales for variables to be quantized // Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
#endif
// Passed from config. // Passed from config.
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
...@@ -155,6 +165,11 @@ struct Argument { ...@@ -155,6 +165,11 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool); bool);
DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
anakin_max_shape_t);
DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
// Memory optimized related. // Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
......
...@@ -13,9 +13,12 @@ ...@@ -13,9 +13,12 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <map>
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -61,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -61,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("mkldnn_enabled_op_types", pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types())); argument->mkldnn_enabled_op_types()));
#ifdef PADDLE_WITH_MKLDNN
} else if (pass_name == "cpu_quantize_placement_pass") { } else if (pass_name == "cpu_quantize_placement_pass") {
pass->Set("quantize_enabled_op_types", pass->Set("quantize_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
...@@ -71,6 +75,7 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -71,6 +75,7 @@ void IRPassManager::CreatePasses(Argument *argument,
} else if (pass_name == "cpu_quantize_pass") { } else if (pass_name == "cpu_quantize_pass") {
pass->Set("quant_var_scales", pass->Set("quant_var_scales",
new VarQuantScale(argument->quant_var_scales())); new VarQuantScale(argument->quant_var_scales()));
#endif
} else if (pass_name == "tensorrt_subgraph_pass") { } else if (pass_name == "tensorrt_subgraph_pass") {
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
...@@ -83,16 +88,40 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -83,16 +88,40 @@ void IRPassManager::CreatePasses(Argument *argument,
AnalysisConfig::Precision::kInt8; AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8)); pass->Set("enable_int8", new bool(enable_int8));
std::string model_opt_cache_dir =
argument->Has("model_dir") bool use_static_engine = argument->tensorrt_use_static_engine();
? argument->model_dir() bool model_from_memory = argument->model_from_memory();
: GetDirRoot(argument->model_program_path()); bool int8_valid = !(model_from_memory && enable_int8);
pass->Set( PADDLE_ENFORCE(int8_valid,
"model_opt_cache_dir", "TRT INT8 Now don't support model load from memory.");
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
if ((!model_from_memory && use_static_engine) || enable_int8) {
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
: GetDirRoot(argument->model_program_path());
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
}
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
pass->Set("engine_opt_info", new std::map<std::string, std::string>(
argument->engine_opt_info()));
}
if (pass_name == "anakin_subgraph_pass") {
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", pass->Set("model_from_memory", new bool(argument->model_from_memory()));
new bool(argument->tensorrt_use_static_engine())); pass->Set("engine_opt_info", new std::map<std::string, std::string>(
argument->engine_opt_info()));
pass->Set("predictor_id", new int(argument->predictor_id()));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->anakin_max_input_shape()));
pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
} }
pre_pass = pass_name; pre_pass = pass_name;
......
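Note on the mechanism used above: attributes attached to a pass with pass->Set(key, new T(...)) are owned by the pass and read back inside its ApplyImpl with Get<T>(key). A minimal sketch, reusing the keys set above (illustrative, not part of this diff):

    // Inside AnakinSubgraphPass::CreateAnakinOp (see the pass source below):
    int max_batch = Get<int>("max_batch_size");
    auto max_shapes =
        Get<std::map<std::string, std::vector<int>>>("max_input_shape");
    int device_id = Get<int>("gpu_device_id");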
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc)
if(WITH_TESTING) if(WITH_TESTING)
add_dependencies(subgraph_detector gtest) add_dependencies(subgraph_detector gtest)
endif() endif()
...@@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -14,3 +14,15 @@ if (WITH_GPU AND TENSORRT_FOUND)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif() endif()
if (ANAKIN_FOUND)
cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)
set(analysis_deps ${analysis_deps}
subgraph_detector anakin_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "")
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/op_teller.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// Those parameters already exist in anakin and should not have another copy
// in fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
}
}
std::unordered_set<const Node *> nodes2remove;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && Agent(node).deleted()) {
nodes2remove.insert(node);
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs,
std::string id) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void AnakinSubgraphPass::CreateAnakinOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
framework::ProgramDesc *program_desc =
Get<framework::ProgramDesc *>("program");
// Add a new block for the Anakin engine op.
const framework::BlockDesc &main_block =
program_desc->Block(framework::kRootBlockIndex);
// const framework::BlockDesc& main_block = program_desc->Block(0);
framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
// A fake block desc.
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
subgraph.size());
for (auto *node : subgraph) {
auto *new_block_op = new_block->AppendOp();
auto *op = block_desc.AppendOp();
*new_block_op->Proto() = *node->Op()->Proto();
*op->Proto() = *node->Op()->Proto();
}
// Then, we will use the input_names_with_id and output_names_with_id to
// generate the engine key.
// So, we use set instead of unordered_set here to ensure that the engine key
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
for (auto *x : node->outputs) {
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("anakin_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map,
false);
// When the anakin engine runs at the end of the operation,
// output_mapping helps us copy the data from the renamed ITensor
// back to the Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "parameters", params);
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
int predictor_id = Get<int>("predictor_id");
auto engine_key = GenerateAnakinEngineKey(
input_names_with_id, output_names_with_id, std::to_string(predictor_id));
SetAttr(op_desc->Proto(), "engine_key", engine_key);
auto max_input_shape =
Get<std::map<std::string, std::vector<int>>>("max_input_shape");
auto max_batch_size = Get<int>("max_batch_size");
auto *anakin_engine =
inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
engine_key);
auto *scope = param_scope();
std::unordered_set<std::string> param_set(params.begin(), params.end());
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlockToAnakinEngine(
&block_desc_temp, scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, anakin_engine);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
REGISTER_PASS(anakin_subgraph_pass,
paddle::inference::analysis::AnakinSubgraphPass);
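A hedged usage sketch (the framework calls below are assumed, not part of this commit): once registered, the pass is retrieved and applied by name, which is how the "anakin_subgraph_pass" entry in kAnakinSubgraphPasses takes effect:

    auto pass =
        framework::ir::PassRegistry::Instance().Get("anakin_subgraph_pass");
    // Required attributes ("max_batch_size", "max_input_shape", "program",
    // "predictor_id", ...) are set by IRPassManager::CreatePasses as shown
    // earlier in this diff.
    graph = pass->Apply(std::move(graph));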
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
using anakin::Precision;
using anakin::saber::NV;
namespace paddle {
namespace inference {
namespace analysis {
class AnakinSubgraphPass : public framework::ir::FusePassBase {
public:
std::unique_ptr<framework::ir::Graph> ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { ...@@ -418,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
// as deleted. 3. Replace the deleted node with the new Block Node. // as deleted. 3. Replace the deleted node with the new Block Node.
framework::OpDesc empty_desc; framework::OpDesc empty_desc;
empty_desc.SetType("tensorrt_engine"); empty_desc.SetType("anakin_engine");
auto *block_node = graph_->CreateOpNode(&empty_desc); auto *block_node = graph_->CreateOpNode(&empty_desc);
Agent(block_node).set_subgraph({}); Agent(block_node).set_subgraph({});
auto io = ExtractInputAndOutputOfSubGraph(subgraph); auto io = ExtractInputAndOutputOfSubGraph(subgraph);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the class to partition a graph.
*/
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
#include <algorithm>
#include <string>
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its persistable property, but sometimes the persistable
// property of a feed op output is also true, so we have to filter those out.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
}
}
std::vector<std::string> parameters;
for (const auto &node : nodes) {
if (!node->IsVar()) continue;
if (node->Var()->Persistable() &&
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
feed_outputs.end()) {
parameters.push_back(node->Name());
}
}
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt) {
//// In the normal case, paddle-trt has a bug when running GoogLeNet.
// When there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt will do a merging optimization that fuses those convs
// into one conv and then triggers the bug. So we use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d" && is_trt) {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
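For illustration (the example names are hypothetical): under the renaming scheme above, an output argument "conv_out" produced by the graph node with id 7 is rewritten to "conv_out7" inside the engine's block desc, and output_name_map records the pair ("conv_out" -> "conv_out7") so the engine op can later map the renamed tensors back to the original output names.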
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file defines the class to partition a graph.
*/
#pragma once
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map,
bool is_trt = true);
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include <map>
#include <set> #include <set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -30,17 +31,6 @@ namespace analysis { ...@@ -30,17 +31,6 @@ namespace analysis {
using framework::ir::Node; using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const { std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
...@@ -209,186 +199,86 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -209,186 +199,86 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "parameters", params); SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8"); auto enable_int8 = Get<bool>("enable_int8");
auto use_static_engine = Get<bool>("use_static_engine");
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0)); std::to_string(0));
// Get "" when there is no cached calibration table data. // Get "" when there is no cached calibration table data.
std::string calibration_data = GetTrtCalibTableData( bool load_from_memory = Get<bool>("model_from_memory");
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8); std::string calibration_data = "";
if (enable_int8) {
calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
}
SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key); SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator; std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
if (enable_int8 && calibration_data.size() != 0) { if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
} }
bool use_static_engine = Get<bool>("use_static_engine");
// When in int8 mode and calibration_mode, the program just produces the // When in int8 mode and calibration_mode, the program just produces the
// calibration table data. // calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0); bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine) { if (calibration_mode) {
std::copy(params.begin(), params.end(), // calibration mode means generating the int8 calibration table data.
std::back_inserter(*repetitive_params)); return;
std::string trt_engine_serialized_data = GetTrtEngineSerializedData( }
Get<std::string>("model_opt_cache_dir"), engine_key);
if (trt_engine_serialized_data.empty()) { std::copy(params.begin(), params.end(),
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " std::back_inserter(*repetitive_params));
"kernel etc). This process may cost a lot of time."; bool need_serialize = (use_static_engine && !load_from_memory);
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine( if (need_serialize) {
Get<int>("max_batch_size"), Get<int>("workspace_size"), trt_engine_serialized_data = GetTrtEngineSerializedData(
enable_int8, calibrator.get(), Get<int>("gpu_device_id"))); Get<std::string>("model_opt_cache_dir"), engine_key);
auto *scope = param_scope(); // we can load the previously serialized engine info from disk.
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); if (!trt_engine_serialized_data.empty()) {
std::unordered_set<std::string> param_set(params.begin(), params.end()); SetAttr(op_desc->Proto(), "engine_serialized_data",
inference::Singleton<inference::tensorrt::OpConverter>::Global() trt_engine_serialized_data);
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
} else {
LOG(INFO) << "Load TRT Optimized Info from " LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath( << GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key); Get<std::string>("model_opt_cache_dir"), engine_key);
} return;
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes) {
// We can judge whether a variable is a parameter by
// its presistable property, but sometimes the presistable
// of the feed op output is true, so we have to identify it.
std::vector<std::string> feed_outputs;
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
} }
} }
std::vector<std::string> parameters; // the following code will NOT run in the following situations:
for (const auto &node : nodes) { // 1. calibration mode (generating the trt int8 calibration table data)
if (!node->IsVar()) continue; // 2. the serialized trt engine info has already been loaded.
if (node->Var()->Persistable() && LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == "kernel etc). This process may cost a lot of time.";
feed_outputs.end()) { std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
parameters.push_back(node->Name()); new tensorrt::TensorRTEngine(
} Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
} calibrator.get(), Get<int>("gpu_device_id")));
return parameters; auto *scope = param_scope();
} framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
void RenameAndGetOutputs( inference::Singleton<inference::tensorrt::OpConverter>::Global()
const std::vector<framework::ir::Node *> &subgraph_nodes, .ConvertBlockToTRTEngine(
framework::BlockDesc *block_desc, &block_desc_temp, *scope,
const std::set<std::string> &input_names_with_id, std::vector<std::string>(input_names.begin(), input_names.end()),
std::set<std::string> *output_names_with_id, param_set, output_mapping, trt_engine.get());
std::set<std::string> *output_names, nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
std::unordered_map<std::string, std::string> *output_name_map) { trt_engine_serialized_data =
//// In the normal case, the paddle-trt exists bug when runing the googlenet. std::string((const char *)serialized_engine_data->data(),
// When there are more than two convolutions of 1 * 1 with the same input, the serialized_engine_data->size());
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid if (need_serialize) {
// this optimization for the time being. This bug will be fixed in the future. SaveTrtEngineSerializedDataToFile(
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/> GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
same_hierarchy_conv2d_num_map; engine_key),
trt_engine_serialized_data);
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
} }
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
} }
} // namespace analysis } // namespace analysis
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -27,6 +27,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -27,6 +27,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
endif() endif()
if (ANAKIN_FOUND)
set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
endif()
add_subdirectory(details) add_subdirectory(details)
cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
extern const std::vector<std::string> kAnakinSubgraphPasses;
PassStrategy *AnalysisConfig::pass_builder() const { PassStrategy *AnalysisConfig::pass_builder() const {
if (!pass_builder_.get()) { if (!pass_builder_.get()) {
...@@ -108,6 +109,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -108,6 +109,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_mkldnn_); CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_enabled_op_types_);
CP_MEMBER(use_anakin_);
CP_MEMBER(anakin_max_batchsize_);
CP_MEMBER(anakin_max_input_shape_);
// Ir related. // Ir related.
CP_MEMBER(enable_ir_optim_); CP_MEMBER(enable_ir_optim_);
CP_MEMBER(use_feed_fetch_ops_); CP_MEMBER(use_feed_fetch_ops_);
...@@ -230,6 +235,20 @@ void AnalysisConfig::Update() { ...@@ -230,6 +235,20 @@ void AnalysisConfig::Update() {
} }
} }
if (use_anakin_) {
PADDLE_ENFORCE(!use_tensorrt_,
"Anakin sub-graph and TensorRT sub-graph are not allowed to "
"run at the same time!");
PADDLE_ENFORCE(
use_gpu_,
"Anakin sub-graph engine need gpu, please use the EnableGpu API.");
pass_builder()->ClearPasses();
for (const auto &pass : kAnakinSubgraphPasses) {
pass_builder()->AppendPass(pass);
}
}
if (ir_debug_) { if (ir_debug_) {
pass_builder()->TurnOnDebug(); pass_builder()->TurnOnDebug();
} }
...@@ -266,7 +285,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -266,7 +285,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << use_anakin_;
return ss.str(); return ss.str();
} }
...@@ -316,6 +335,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer, ...@@ -316,6 +335,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
Update(); Update();
} }
void AnalysisConfig::SetEngineOptInfo(
std::map<std::string, std::string> engine_opt_info) {
engine_opt_info_ = engine_opt_info;
}
NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig AnalysisConfig::ToNativeConfig() const {
NativeConfig config; NativeConfig config;
config.model_dir = model_dir_; config.model_dir = model_dir_;
...@@ -332,5 +356,12 @@ void AnalysisConfig::SwitchIrDebug(int x) { ...@@ -332,5 +356,12 @@ void AnalysisConfig::SwitchIrDebug(int x) {
ir_debug_ = x; ir_debug_ = x;
Update(); Update();
} }
void AnalysisConfig::EnableAnakinEngine(
int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape) {
anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape;
use_anakin_ = true;
Update();
}
} // namespace paddle } // namespace paddle
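A hypothetical caller-side sketch (API names other than EnableAnakinEngine are assumed from the existing public config interface): enabling the Anakin sub-graph engine through the entry point added above:

    AnalysisConfig config;
    config.SetModel("/path/to/model");     // assumed model location
    config.EnableUseGpu(100 /* MB */, 0);  // the Anakin engine requires GPU
    config.EnableAnakinEngine(1 /* max_batch_size */,
                              {{"input_0", {1, 3, 224, 224}}});
    auto predictor = CreatePaddlePredictor(config);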
...@@ -40,7 +40,10 @@ ...@@ -40,7 +40,10 @@
#if PADDLE_WITH_TENSORRT #if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
#if PADDLE_WITH_ANAKIN
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#endif #endif
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -349,7 +352,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -349,7 +352,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetStaticMemoryOptimForceUpdate( argument_.SetStaticMemoryOptimForceUpdate(
config_.static_memory_optim_force_update_); config_.static_memory_optim_force_update_);
argument_.SetModelFromMemory(config_.model_from_memory_); argument_.SetModelFromMemory(config_.model_from_memory_);
argument_.SetEngineOptInfo(config_.engine_opt_info_);
// Analyze inference_program // Analyze inference_program
argument_.SetUseAnakin(config_.anakin_engine_enabled());
argument_.SetPredictorID(predictor_id_);
if (!config_.model_dir().empty()) { if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir()); argument_.SetModelDir(config_.model_dir());
} else { } else {
...@@ -373,6 +379,12 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -373,6 +379,12 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
} }
if (config_.use_gpu() && config_.anakin_engine_enabled()) {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
LOG(INFO) << "Anakin subgraph engine is enabled";
}
if (config_.use_mkldnn_) { if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled"; LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
...@@ -402,7 +414,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -402,7 +414,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu()) { if (config.use_gpu()) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id()); config.gpu_device_id());
std::vector<std::string> flags; std::vector<std::string> flags;
...@@ -805,3 +817,27 @@ USE_TRT_CONVERTER(prelu); ...@@ -805,3 +817,27 @@ USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(conv2d_transpose);
USE_TRT_CONVERTER(leaky_relu); USE_TRT_CONVERTER(leaky_relu);
#endif #endif
#if PADDLE_WITH_ANAKIN
USE_ANAKIN_CONVERTER(mul);
USE_ANAKIN_CONVERTER(fc);
USE_ANAKIN_CONVERTER(conv2d);
USE_ANAKIN_CONVERTER(conv2d_fusion);
USE_ANAKIN_CONVERTER(concat);
USE_ANAKIN_CONVERTER(split);
USE_ANAKIN_CONVERTER(relu);
USE_ANAKIN_CONVERTER(sigmoid);
USE_ANAKIN_CONVERTER(tanh);
USE_ANAKIN_CONVERTER(pool2d);
USE_ANAKIN_CONVERTER(elementwise_add);
USE_ANAKIN_CONVERTER(elementwise_mul);
USE_ANAKIN_CONVERTER(batch_norm);
USE_ANAKIN_CONVERTER(flatten);
USE_ANAKIN_CONVERTER(reshape);
USE_ANAKIN_CONVERTER(transpose);
USE_ANAKIN_CONVERTER(softmax);
USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum);
#endif
...@@ -45,7 +45,9 @@ using framework::NaiveExecutor; ...@@ -45,7 +45,9 @@ using framework::NaiveExecutor;
*/ */
class AnalysisPredictor : public PaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
public: public:
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
predictor_id_ = inference::GetUniqueId();
}
~AnalysisPredictor(); ~AnalysisPredictor();
bool Init(const std::shared_ptr<framework::Scope> &parent_scope, bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
...@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor {
const size_t max_shape_collect_count_{1000}; const size_t max_shape_collect_count_{1000};
int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true.
std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_; std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
int predictor_id_;
private: private:
// Some status here that help to determine the status inside the predictor. // Some status here that help to determine the status inside the predictor.
......
...@@ -74,6 +74,21 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { ...@@ -74,6 +74,21 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res; return res;
} }
PaddleDType ZeroCopyTensor::type() const {
EAGER_GET_TENSOR;
auto type = tensor->type();
if (type == framework::proto::VarType::FP32) {
return PaddleDType::FLOAT32;
} else if (type == framework::proto::VarType::INT64) {
return PaddleDType::INT64;
} else if (type == framework::proto::VarType::INT32) {
return PaddleDType::INT32;
} else {
LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
return PaddleDType::FLOAT32;
}
template <typename T> template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) { void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR; EAGER_GET_TENSOR;
...@@ -119,6 +134,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { ...@@ -119,6 +134,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place, memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream()); t_data, ele_num * sizeof(T), dev_ctx->stream());
cudaDeviceSynchronize();
#else #else
PADDLE_THROW("Not compile with CUDA, should not reach here."); PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif #endif
......
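An illustrative sketch of how a caller might use the new type() accessor (the output name and surrounding setup are assumptions, not part of this commit):

    auto output = predictor->GetOutputTensor("fetch_out");  // hypothetical name
    if (output->type() == PaddleDType::FLOAT32) {
      std::vector<float> result(1000);  // size would come from output->shape()
      output->copy_to_cpu(result.data());
    }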
...@@ -14,9 +14,11 @@ ...@@ -14,9 +14,11 @@
#pragma once #pragma once
#include <cassert> #include <cassert>
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
/*! \file */ /*! \file */
...@@ -136,10 +138,20 @@ struct AnalysisConfig { ...@@ -136,10 +138,20 @@ struct AnalysisConfig {
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3, int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32, Precision precision = Precision::kFloat32,
bool use_static = true); bool use_static = false);
/** A boolean state telling whether the TensorRT engine is used. /** A boolean state telling whether the TensorRT engine is used.
*/ */
bool tensorrt_engine_enabled() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/**
* \brief Turn on the usage of Anakin sub-graph engine.
*/
void EnableAnakinEngine(
int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {});
/** A boolean state indicating whether the Anakin sub-graph engine is used.
*/
bool anakin_engine_enabled() const { return use_anakin_; }
/** \brief Control whether to debug IR graph analysis phase. /** \brief Control whether to debug IR graph analysis phase.
* *
...@@ -185,6 +197,7 @@ struct AnalysisConfig { ...@@ -185,6 +197,7 @@ struct AnalysisConfig {
/** A boolean state telling whether the model is set from the CPU memory. /** A boolean state telling whether the model is set from the CPU memory.
*/ */
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info);
/** Turn on memory optimize /** Turn on memory optimize
* NOTE still in development, will release latter. * NOTE still in development, will release latter.
...@@ -258,6 +271,10 @@ struct AnalysisConfig { ...@@ -258,6 +271,10 @@ struct AnalysisConfig {
std::string serialized_info_cache_; std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_; mutable std::unique_ptr<PassStrategy> pass_builder_;
bool use_anakin_{false};
int anakin_max_batchsize_;
std::map<std::string, std::vector<int>> anakin_max_input_shape_;
std::map<std::string, std::string> engine_opt_info_;
}; };
} // namespace paddle } // namespace paddle
...@@ -177,6 +177,8 @@ class ZeroCopyTensor { ...@@ -177,6 +177,8 @@ class ZeroCopyTensor {
device_ = device; device_ = device;
} }
PaddleDType type() const;
protected: protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {} explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; } void SetName(const std::string& name) { name_ = name; }
...@@ -191,6 +193,7 @@ class ZeroCopyTensor { ...@@ -191,6 +193,7 @@ class ZeroCopyTensor {
// performance. // performance.
mutable void* tensor_{nullptr}; mutable void* tensor_{nullptr};
PaddlePlace place_; PaddlePlace place_;
PaddleDType dtype_;
int device_; int device_;
}; };
......
...@@ -68,10 +68,26 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -68,10 +68,26 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
// The following passes work for the Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", //
"simplify_anakin_detection_pattern_pass5", //
"simplify_anakin_detection_pattern_pass4", //
"simplify_anakin_detection_pattern_pass3", //
"simplify_anakin_detection_pattern_pass2", //
"anakin_fillconstant_elementwisemul_fuse", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"anakin_subgraph_pass",
});
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"identity_scale_op_clean_pass", // // "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
...@@ -84,7 +100,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -84,7 +100,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
#endif #endif
}); });
for (int i = 6; i >= 3; i--) { for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) + passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass"); "_concat_fuse_pass");
} }
...@@ -124,4 +140,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -124,4 +140,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
}); });
use_gpu_ = false; use_gpu_ = false;
} }
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
} // namespace paddle } // namespace paddle
...@@ -45,6 +45,7 @@ class PaddlePassBuilder { ...@@ -45,6 +45,7 @@ class PaddlePassBuilder {
/** Delete all the passes that has type `pass_type`. */ /** Delete all the passes that has type `pass_type`. */
void DeletePass(const std::string &pass_type); void DeletePass(const std::string &pass_type);
void ClearPasses();
/** Append an analysis pass. */ /** Append an analysis pass. */
void AppendAnalysisPass(const std::string &pass); void AppendAnalysisPass(const std::string &pass);
...@@ -157,4 +158,6 @@ class GpuPassStrategy : public PassStrategy { ...@@ -157,4 +158,6 @@ class GpuPassStrategy : public PassStrategy {
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
}; };
extern const std::vector<std::string> kAnakinSubgraphPasses;
} // namespace paddle } // namespace paddle
...@@ -45,7 +45,7 @@ class EngineIOConverter { ...@@ -45,7 +45,7 @@ class EngineIOConverter {
static void ConvertInput(const std::string& op_type, const LoDTensor& in, static void ConvertInput(const std::string& op_type, const LoDTensor& in,
void* out, size_t max_size, cudaStream_t* stream) { void* out, size_t max_size, cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr); PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineIOConverter>::Lookup( auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */); op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter); PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream); converter->SetStream(stream);
...@@ -56,7 +56,7 @@ class EngineIOConverter { ...@@ -56,7 +56,7 @@ class EngineIOConverter {
LoDTensor* out, size_t max_size, LoDTensor* out, size_t max_size,
cudaStream_t* stream) { cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr); PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineIOConverter>::Lookup( auto* converter = Registry<EngineIOConverter>::Global().Lookup(
op_type, "default" /* default_type */); op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter); PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream); converter->SetStream(stream);
...@@ -69,12 +69,12 @@ class EngineIOConverter { ...@@ -69,12 +69,12 @@ class EngineIOConverter {
cudaStream_t* stream_{nullptr}; cudaStream_t* stream_{nullptr};
}; };
#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \ #define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \
struct trt_io_##op_type__##_converter { \ struct trt_io_##op_type__##_converter { \
trt_io_##op_type__##_converter() { \ trt_io_##op_type__##_converter() { \
Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \ Registry<EngineIOConverter>::Global().Register<Converter__>(#op_type__); \
} \ } \
}; \ }; \
trt_io_##op_type__##_converter trt_io_##op_type__##_converter__; trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
} // namespace tensorrt } // namespace tensorrt
......
...@@ -86,7 +86,7 @@ class OpConverter { ...@@ -86,7 +86,7 @@ class OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
std::string Y = op_desc.Input("Y")[0]; std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) { if (parameters.count(Y)) {
it = Registry<OpConverter>::Lookup("fc"); it = Registry<OpConverter>::Global().Lookup("fc");
} }
} }
if (op_desc.Type().find("elementwise") != std::string::npos) { if (op_desc.Type().find("elementwise") != std::string::npos) {
...@@ -103,28 +103,28 @@ class OpConverter { ...@@ -103,28 +103,28 @@ class OpConverter {
if (parameters.count(Y)) { if (parameters.count(Y)) {
PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
it = it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight"); "_weight");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} else { } else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
it = it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor"); "_tensor");
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} }
if (op_desc.Type() == "depthwise_conv2d") { if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Lookup("conv2d"); it = Registry<OpConverter>::Global().Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
} }
if (!it) { if (!it) {
it = Registry<OpConverter>::Lookup(op_desc.Type()); it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
} }
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type()); op_desc.Type());
...@@ -198,9 +198,9 @@ class OpConverter { ...@@ -198,9 +198,9 @@ class OpConverter {
#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
trt_##op_type__##_converter() { \ trt_##op_type__##_converter() { \
::paddle::inference:: \ ::paddle::inference::Registry< \
Registry<paddle::inference::tensorrt::OpConverter>::Register< \ paddle::inference::tensorrt::OpConverter>::Global() \
::paddle::inference::tensorrt::Converter__>(#op_type__); \ .Register<::paddle::inference::tensorrt::Converter__>(#op_type__); \
} \ } \
}; \ }; \
trt_##op_type__##_converter trt_##op_type__##_converter__; \ trt_##op_type__##_converter trt_##op_type__##_converter__; \
......
...@@ -214,23 +214,28 @@ TEST(Analyzer_Transformer, fuse_statis) { ...@@ -214,23 +214,28 @@ TEST(Analyzer_Transformer, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) { // void compare(bool use_mkldnn = false) {
AnalysisConfig cfg; // AnalysisConfig cfg;
SetConfig(&cfg); // SetConfig(&cfg);
if (use_mkldnn) { // if (use_mkldnn) {
cfg.EnableMKLDNN(); // cfg.EnableMKLDNN();
} // }
//
std::vector<std::vector<PaddleTensor>> input_slots_all; // std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); // SetInput(&input_slots_all);
CompareNativeAndAnalysis( // CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); // reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
} // input_slots_all);
// }
TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN // TODO(yihuaxu):
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } // Disable compare and compare_mkldnn temporarily; see
#endif // https://github.com/paddlePaddle/Paddle/issues/16316 for details.
// TEST(Analyzer_Transformer, compare) { compare(); }
// #ifdef PADDLE_WITH_MKLDNN
// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
// }
// #endif
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -45,13 +45,13 @@ struct Registry { ...@@ -45,13 +45,13 @@ struct Registry {
} }
template <typename ItemChild> template <typename ItemChild>
static void Register(const std::string& name) { void Register(const std::string& name) {
PADDLE_ENFORCE_EQ(items_.count(name), 0); PADDLE_ENFORCE_EQ(items_.count(name), 0);
items_[name] = new ItemChild; items_[name] = new ItemChild;
} }
static ItemParent* Lookup(const std::string& name, ItemParent* Lookup(const std::string& name,
const std::string& default_name = "") { const std::string& default_name = "") {
auto it = items_.find(name); auto it = items_.find(name);
if (it == items_.end()) { if (it == items_.end()) {
if (default_name == "") if (default_name == "")
...@@ -70,11 +70,8 @@ struct Registry { ...@@ -70,11 +70,8 @@ struct Registry {
private: private:
Registry() = default; Registry() = default;
static std::unordered_map<std::string, ItemParent*> items_; std::unordered_map<std::string, ItemParent*> items_;
}; };
template <typename ItemParent>
std::unordered_map<std::string, ItemParent*> Registry<ItemParent>::items_;
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -34,6 +34,10 @@ if (WITH_GPU AND TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
if (ANAKIN_FOUND)
add_subdirectory(anakin)
endif()
SET(OP_HEADER_DEPS xxhash) SET(OP_HEADER_DEPS xxhash)
if (WITH_GPU) if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
......
op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter)
# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n")
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <string>
#include <vector>
#include "paddle/fluid/operators/anakin/anakin_engine_op.h"
namespace paddle {
namespace operators {
class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
AddAttr<framework::BlockDesc *>("sub_block", "the trt block");
AddComment("Anakin engine operator.");
}
};
class AnakinEngineInferVarType : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker,
ops::AnakinEngineOpMaker);
#endif // PADDLE_WITH_CUDA
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/anakin/convert/op_converter.h"
#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace operators {
using FluidDT = framework::proto::VarType_Type;
using inference::Singleton;
using anakin::graph::GraphGlobalMem;
using anakin::AK_FLOAT;
using anakin::Precision;
using anakin::saber::NV;
using anakin::saber::X86;
using anakin::saber::Shape;
using anakin::PBlock;
using anakin::PTuple;
using inference::anakin::AnakinEngine;
class AnakinEngineOp : public framework::OperatorBase {
using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable AnakinNvEngineT *anakin_engine_;
std::string engine_key_;
std::string engine_serialized_data_;
public:
AnakinEngineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {
input_names_ = Inputs("Xs");
engine_key_ = Attr<std::string>("engine_key");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
anakin_engine_ = nullptr;
}
protected:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
RunAnakin(scope, dev_place);
}
void RunAnakin(const framework::Scope &scope,
const platform::Place &dev_place) const {
auto *engine = GetEngine(scope, dev_place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
std::map<std::string, framework::LoDTensor *> inputs;
// Convert input tensor from fluid to engine.
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
inputs.insert({x, &t});
}
std::map<std::string, framework::LoDTensor *> outputs;
int output_index = 0;
for (const auto &y : Outputs("Ys")) {
auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
outputs.insert({output_maps[output_index], fluid_t});
output_index += 1;
}
engine->Execute(inputs, outputs, stream);
}
AnakinNvEngineT *GetEngine(const framework::Scope &scope,
const platform::Place &dev_place) const {
if (anakin_engine_ == nullptr) {
anakin_engine_ =
inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
.Get(engine_key_);
}
return anakin_engine_;
}
void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
AnakinNvEngineT *engine) const {
LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph"));
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine);
engine->Freeze();
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize2int(t.dims());
// all input shape should be 4 dims
if (t_shape.size() == 2) {
t_shape.push_back(1);
t_shape.push_back(1);
}
engine->SetInputShape(x, t_shape);
}
engine->Optimize();
engine->InitGraph();
}
};
} // namespace operators
} // namespace paddle
#endif // PADDLE_WITH_CUDA
...@@ -586,14 +586,10 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const { ...@@ -586,14 +586,10 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
return std::unique_ptr<framework::OpDesc>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
class BatchNormInplaceInToOut : public framework::InplaceInToOut { class BatchNormInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}, {"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"},
}; };
...@@ -601,14 +597,10 @@ class BatchNormInplaceInToOut : public framework::InplaceInToOut { ...@@ -601,14 +597,10 @@ class BatchNormInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { class BatchNormGradInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
// Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C]
{framework::GradVarName("Y"), framework::GradVarName("X")}, {framework::GradVarName("Y"), framework::GradVarName("X")},
......
...@@ -44,6 +44,7 @@ class WhileOp : public framework::OperatorBase { ...@@ -44,6 +44,7 @@ class WhileOp : public framework::OperatorBase {
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>(); auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
...@@ -63,13 +64,34 @@ class WhileOp : public framework::OperatorBase { ...@@ -63,13 +64,34 @@ class WhileOp : public framework::OperatorBase {
VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
while (cond.data<bool>()[0]) { if (!is_test) {
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
true);
}
} else {
auto &current_scope = scope.NewScope(); auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope); executor.CreateVariables(*program, &current_scope, block->ID());
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true); while (cond.data<bool>()[0]) {
if (is_test) { for (auto &name : current_scope.LocalVarNames()) {
scope.DeleteScope(&current_scope); auto *var = current_scope.Var(name);
if (var->IsType<framework::LoDTensor>()) {
// Clear all lod information for all lod_tensors.
auto *t = var->GetMutable<framework::LoDTensor>();
framework::LoD empty_lod;
t->set_lod(empty_lod);
} else if (var->IsType<framework::LoDTensorArray>()) {
// Clear elements of all tensor arrays.
auto *t = var->GetMutable<framework::LoDTensorArray>();
t->clear();
}
}
executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
false);
} }
scope.DeleteScope(&current_scope);
} }
} }
}; };
......
...@@ -252,34 +252,23 @@ class ElemwiseGradKernel : public framework::OpKernel<T> { ...@@ -252,34 +252,23 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
} }
}; };
class ElementwiseOpInplace : public framework::InplaceInToOut { class ElementwiseOpInplace : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return std::unordered_map<std::string, std::string>{ return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"X", "Out"},
}; };
} }
}; };
class ElementwiseGradOpInplace : public framework::InplaceInToOut { class ElementwiseGradOpInplace : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected: return std::unordered_map<std::string, std::string>{
std::unordered_map<std::string, std::string> Apply( {framework::GradVarName("Out"), framework::GradVarName("X")},
const framework::OpDesc &op_desc, };
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
} }
}; };
......
...@@ -267,14 +267,10 @@ class Flatten2GradOp : public framework::OperatorBase { ...@@ -267,14 +267,10 @@ class Flatten2GradOp : public framework::OperatorBase {
} }
}; };
class FlattenOpInplaceInToOut : public framework::InplaceInToOut { class FlattenOpInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"}, {"X", "Out"},
}; };
...@@ -282,13 +278,10 @@ class FlattenOpInplaceInToOut : public framework::InplaceInToOut { ...@@ -282,13 +278,10 @@ class FlattenOpInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class FlattenGradInplaceinToOut : public framework::InplaceInToOut { class FlattenGradInplaceinToOut : public framework::InplaceOpInference {
using InplaceInToOut::InplaceInToOut; public:
std::unordered_map<std::string, std::string> operator()(
protected: const framework::OpDesc &op_desc) const override {
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")}, {framework::GradVarName("Out"), framework::GradVarName("X")},
}; };
......
...@@ -64,6 +64,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -64,6 +64,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>(); const T* p_src = src.data<T>();
// why must be int?
const int* p_index = index.data<int>(); const int* p_index = index.data<int>();
T* p_output = output->data<T>(); T* p_output = output->data<T>();
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/operators/group_norm_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -170,26 +172,18 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { ...@@ -170,26 +172,18 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
class GroupNormInplaceInToOut : public framework::InplaceInToOut { class GroupNormInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{"X", "Y"}}; return {{"X", "Y"}};
} }
}; };
class GroupNormGradInplaceInToOut : public framework::InplaceInToOut { class GroupNormGradInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; return {{framework::GradVarName("Y"), framework::GradVarName("X")}};
} }
}; };
......
# JIT Kernel # JIT Kernel
JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic. JIT (Just In Time) Kernel contains actually generated code and some other implementations with the same logic.
Each implementations has its own condition to use, defined in `UseMe`. Each implementation has its own condition to use, defined in `CanBeUsed`.
They are combined together to get the best performance of one single independent function. They are combined together to get the best performance of one single independent function.
They could be some very simple functions like vector multiply, or some complicated functions like LSTM. They could be some very simple functions like vector multiply, or some complicated functions like LSTM.
And they can be composed with some other existing jit kernels to build up a complex function. And they can be composed with some other existing jit kernels to build up a complex function.
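To illustrate the selection idea only (the names below are made up for this sketch and are not Paddle's real classes), each candidate implementation can carry a usability condition, and the dispatcher runs the first applicable candidate, assuming candidates are ordered from fastest to slowest:
```cpp
#include <functional>
#include <vector>

// Illustrative only: each candidate pairs a usability check with a kernel body.
struct VScalCandidate {
  std::function<bool(int)> can_be_used;  // condition on the attribute (the length here)
  std::function<void(float, const float*, float*, int)> kernel;
};

// Run the first candidate whose condition holds for the given length.
void RunVScal(const std::vector<VScalCandidate>& candidates, float a,
              const float* x, float* y, int n) {
  for (const auto& c : candidates) {
    if (c.can_be_used(n)) {
      c.kernel(a, x, y, n);
      return;
    }
  }
}
```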
...@@ -42,35 +42,62 @@ All basical definations of jit kernels are addressed in `paddle/fluid/operators/ ...@@ -42,35 +42,62 @@ All basical definations of jit kernels are addressed in `paddle/fluid/operators/
## How to use ## How to use
One simple function `jit::Get`, which is very easy to use, is supported to get the kernel. We present these methods to get the functions:
It can automatically return the expected function with best performance under the given attributes. - `GetAllCandidateFuncs`. It returns all the supported implementations. All of the implementations produce the same result. You can run some runtime benchmarks to choose which one should actually be used.
All kernels are included in `paddle/fluid/operators/jit/kernels.h`, you can only include this one header to get all the registered kernels. - `GetDefaultBestFunc`. It returns only one default function pointer, which is tuned offline with some general configurations and attributes. This should cover most situations.
- `KernelFuncs::Cache()`. It gets the default best function and caches it, so the next call with the same attribute reuses the cached one.
- `GetReferFunc`. It only gets the reference implementation on CPU, and all the other implementations have the same logic as this reference code.
And here are some examples:
Get from cache:
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
seqpool_func(src_data, dst_data, &attr);
```
Get all implementations and run once:
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
for (auto f : funcs) {
LOG(INFO) << "Kernel implementation type: " << f.first;
f.second(src_data, dst_data, &attr);
}
```
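Get the default best function (a sketch; it assumes `jit::GetDefaultBestFunc` takes the same template parameters and attribute argument as `GetAllCandidateFuncsWithTypes` above):
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
// One function pointer chosen from the offline-tuned defaults.
auto best_func = jit::GetDefaultBestFunc<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
best_func(src_data, dst_data, &attr);
```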
All kernels are included in `paddle/fluid/operators/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels.
## Solid Test ## Solid Test
- Unit Test - Unit Test
All functions should be compared with the corresponding reference functions, including data types `float` and `double`. All functions should be compared with the corresponding reference functions, including data types `float` and `double`.
- Benchmark - Benchmark
All functions should be tested, and make sure the `jit::Get` function obtains the best performance with all attributes. All functions should be tested, and make sure the `jit::GetDefaultBestFunc` function obtains the best performance with all attributes.
# How to add new kernel # How to add new kernel
## Required ## Required
1. Add `your_key` at `KernelType`. 1. Add `your_key` at `KernelType`.
2. Add reference function of `your_key`. 2. Add your new `KernelTuple`, which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple` (see the sketch after this list).
3. Add reference function of `your_key`.
Note: Note:
- this should run on CPU and must not depend on any third-party library. - this should run on CPU and must not depend on any third-party library.
- Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
3. Add unit test in `test.cc`, and verify at least `float` and `double`. 4. Add unit test in `test.cc`, and verify at least `float` and `double`.
Test more data types for some special functions if necessary, for example `int8`. Test more data types for some special functions if necessary, for example `int8`.
4. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `jit::Get` always gets the best one. 5. Add functions in `benchmark.cc` to test all functions of the same `KernelType`. Make sure `GetDefaultBestFunc` always gets the best one.
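A rough sketch of steps 1-3 (hypothetical names only; the real declarations live under `paddle/fluid/operators/jit/` and may differ in detail):
```cpp
// Sketch only: not the actual Paddle declarations.
enum KernelType { kNone = 0, kMyOp /* 1. "your_key" added to KernelType */ };

// 2. The KernelTuple bundles the data type, the attribute type and the
//    function signature for this key.
template <typename T>
struct MyOpTuple {
  static constexpr KernelType kernel_type = kMyOp;
  typedef T data_type;
  typedef int attr_type;  // e.g. the processed length
  typedef void (*func_type)(const T* x, T* y, int n);
};

// 3. The reference implementation: plain CPU code with no third-party deps.
template <typename T>
void MyOpRefer(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + x[i];
  }
}
```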
## Optional ## Optional
Add more implementations of `your_key` for performance enhancement. Add more implementations of `your_key` for performance enhancement.
1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have a corresponding creator from `JitCodeCreator` which will be registered on the `your_key`. 1. Add functions based on generated code in `gen`. It should be derived from `JitCode` and should have a corresponding creator from `JitCodeCreator` which will be registered on the `your_key`.
Note: Add new `KernelTuples` if necessary,your can refer to `XYZNTuples`. 2. If a new attribute type is added, you should specialize `JitCodeKey` for this type.
Specialie method `JitCodeKey` when add new attribute type。 3. Add more functions in `more`; you can use any third party you wish, like mkl, mkldnn or intrinsic code, to reach the best performance.
2. Add more functions in `more`,you can use any third party you wish, like mkl, mkldnn or intrinsic code to reach the best performance.
# JIT Kernel # JIT Kernel
Combine function templates with JIT to generate the needed kernel functions. Combine function templates with JIT to generate the needed kernel functions.
The kernels here are operator units at a smaller granularity than the kernels inside Operators, and they focus more on performance on different hardware. There can be implementations based on several third-party libraries, and each implementation has its own `UseMe` function that decides under which conditions it can be called. The kernels here are operator units at a smaller granularity than the kernels inside Operators, and they focus more on performance on different hardware. There can be implementations based on several third-party libraries, and each implementation has its own `CanBeUsed` function that decides under which conditions it can be called.
The functions implemented here can be very fine-grained, such as Vector MUL, or a complex piece of logic such as LSTM. Complex logic can also be assembled from its own lower-level functions. The functions implemented here can be very fine-grained, such as Vector MUL, or a complex piece of logic such as LSTM. Complex logic can also be assembled from its own lower-level functions.
Currently only high-performance computation on CPU is supported.
...@@ -39,27 +39,55 @@ PaddlePaddle/Paddle/paddle/fluid/ ...@@ -39,27 +39,55 @@ PaddlePaddle/Paddle/paddle/fluid/
## Dynamic retrieval ## Dynamic retrieval
A `jit::Get` method is provided, which retrieves a kernel by its type; each implementation has its own applicable range, and the needed kernel function is selected dynamically according to that range and the current conditions. - A `GetAllCandidateFuncs` method is provided, which, given the kernel type, returns all function implementations that meet the requirements. All implementations guarantee the same result but differ in speed; you can benchmark them at runtime for the specific input attribute sizes and pick the best function manually.
- A `GetDefaultBestFunc` method is provided, which returns one default best function implementation. It is the result of offline tuning with some general configurations and attributes, and covers the best result in most cases.
- A `KernelFuncs::Cache()` method is provided, which returns the default best function and caches the function pointer; when the same attribute appears again, the previously cached pointer is returned directly, otherwise a new one is created for that attribute.
- A `GetReferFunc` method is provided, which returns the kernel's most primitive logic function. It is independent of the kernel's input size and attributes, and there is one and only one implementation, on CPU. It characterizes the kernel's original logic, and all other implementations keep their logic consistent with it.
### Examples
Calling any kernel only requires including the header `"paddle/fluid/operators/jit/kernels.h"`, which is automatically generated at compile time.
Get the default best function directly from the cache.
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
seqpool_func(src_data, dst_data, &attr);
```
Run all implementations once and print the implementation type.
```cpp
using T = float;
jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr);
for (auto f : funcs) {
LOG(INFO) << "Kernel implementation type: " << f.first;
f.second(src_data, dst_data, &attr);
}
```
## Tests ## Tests
- Logic tests - Logic tests
All implementations must be compared against the refer code and must meet the precision requirements, for both the float and double data types. All implementations must be compared against the refer code and must meet the precision requirements, for both the float and double data types.
- Performance tests - Performance tests
Compare the performance of all implementations, and also against the final `jit::Get` method; the performance obtained by that method must be the best under all conditions. Compare the performance of all implementations, and also against the final `jit::GetDefaultBestFunc` method; the performance obtained by that method must be the best under all conditions.
# How to add a new operator # How to add a new operator
- Add `your_key` to `KernelType`. 1. Add `your_key` to `KernelType`.
- Implement the Reference logic; this must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable this kernel. 2. Implement the Reference logic; this must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to enable this kernel.
- (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsic code or mkldnn. 3. (optional) Implement more algorithms under the `more` directory; they may depend on third-party libraries such as mkl, intrinsic code or mkldnn.
- (optional) Implement Xbyak-based code generation under the `gen` directory. The jitcode needs to implement its own `JitCodeCreator` and register it on the same `KernelType` as the refer implementation. 4. (optional) Implement Xbyak-based code generation under the `gen` directory. The jitcode needs to implement its own `JitCodeCreator` and register it on the same `KernelType` as the refer implementation.
- Add new `KernelTuples` when necessary; you can refer to `XYZNTuples`. A newly added Attr type needs a specialization of the `JitCodeKey` method. 5. Add a new `KernelTuple`, which must correspond one-to-one with a `KernelType`; it is a bundle of all the types involved, including the data type, the attribute type and the returned function type. You can refer to `SeqPoolTuple`. A newly added Attr type needs a specialization of the `JitCodeKey` method.
- Add unit tests in `test.cc`; at least the `float` and `double` data types must be tested, and additional data types, for example the `int8` related functions, should be covered when necessary. 6. Add unit tests in `test.cc`; at least the `float` and `double` data types must be tested, and additional data types, for example the `int8` related functions, should be covered when necessary.
- Add the corresponding performance comparison in `benchmark.cc`; all implementations of the same kernel must be compared, and make sure the implementation obtained by `jit::Get` is always the fastest. 7. Add the corresponding performance comparison in `benchmark.cc`; all implementations of the same kernel must be compared, and make sure the implementation obtained by `GetDefaultBestFunc` is always the fastest.
# Advantages # Advantages
- A unified Get method with a simple interface. - Convenient interfaces and flexible invocation.
- The same logic can have multiple implementations, which may depend on several third-party libraries without affecting each other. - The same logic can have multiple implementations, which may depend on several third-party libraries without affecting each other.
- The directory structure is clear, avoiding the poor readability caused by piling many macro definitions into one file. - The directory structure is clear, avoiding the poor readability caused by piling many macro definitions into one file.
- Optimization is convenient: a specific attribute can be optimized directly without affecting the performance under other attributes. - Optimization is convenient: a specific attribute can be optimized directly without affecting the performance under other attributes.
......
...@@ -88,4 +88,5 @@ REGISTER_OP_CPU_KERNEL( ...@@ -88,4 +88,5 @@ REGISTER_OP_CPU_KERNEL(
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>, ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -64,4 +64,5 @@ REGISTER_OP_CPU_KERNEL( ...@@ -64,4 +64,5 @@ REGISTER_OP_CPU_KERNEL(
load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>, load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -33,7 +33,7 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -33,7 +33,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W"); auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids"); auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size(); int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl;
PADDLE_ENFORCE_EQ(table_dims.size(), 2); PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1."); "The last dimension of the 'Ids' tensor must be 1.");
......
...@@ -322,14 +322,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel { ...@@ -322,14 +322,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
} }
}; };
class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { class ReshapeOpInplaceInToOut : public framework::InplaceOpInference {
public: public:
using InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc &op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{"X", "Out"}, {"X", "Out"},
}; };
...@@ -337,13 +333,10 @@ class ReshapeOpInplaceInToOut : public framework::InplaceInToOut { ...@@ -337,13 +333,10 @@ class ReshapeOpInplaceInToOut : public framework::InplaceInToOut {
} }
}; };
class ReshapeGradInplaceInToOut : public framework::InplaceInToOut { class ReshapeGradInplaceInToOut : public framework::InplaceOpInference {
using InplaceInToOut::InplaceInToOut; public:
std::unordered_map<std::string, std::string> operator()(
protected: const framework::OpDesc &op_desc) const override {
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> inplace_in_to_out = { std::unordered_map<std::string, std::string> inplace_in_to_out = {
{framework::GradVarName("Out"), framework::GradVarName("X")}, {framework::GradVarName("Out"), framework::GradVarName("X")},
}; };
......
...@@ -19,11 +19,27 @@ limitations under the License. */ ...@@ -19,11 +19,27 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class SaveCombineOp : public framework::OperatorWithKernel { class SaveCombineOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.GetPlace());
}
// TODO(lujun): The override here is just to bypass transform
// in operator impl, which is not elegant enough.
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
return expected_kernel_type;
}
}; };
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
...@@ -54,7 +70,7 @@ to a file on disk. ...@@ -54,7 +70,7 @@ to a file on disk.
"(string)" "(string)"
"The \"file_path\" where the LoDTensor variables will be saved.") "The \"file_path\" where the LoDTensor variables will be saved.")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string& path) { return !path.empty(); });
} }
}; };
...@@ -70,5 +86,4 @@ REGISTER_OP_CPU_KERNEL( ...@@ -70,5 +86,4 @@ REGISTER_OP_CPU_KERNEL(
save_combine, save_combine,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>, ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>);
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -20,6 +20,4 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -20,6 +20,4 @@ REGISTER_OP_CUDA_KERNEL(
save_combine, save_combine,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>, ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>);
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
...@@ -199,14 +201,10 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { ...@@ -199,14 +201,10 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
class SoftmaxInplaceInToOut : public framework::InplaceInToOut { class SoftmaxInplaceInToOut : public framework::InplaceOpInference {
public: public:
using framework::InplaceInToOut::InplaceInToOut; std::unordered_map<std::string, std::string> operator()(
const framework::OpDesc& op_desc) const override {
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
return std::unordered_map<std::string, std::string>{ return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"X", "Out"},
}; };
......
...@@ -212,6 +212,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -212,6 +212,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
: workspace_(nullptr), stream_(stream), place_(place) { : workspace_(nullptr), stream_(stream), place_(place) {
PADDLE_ENFORCE(cudaSetDevice(place_.device));
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
} }
...@@ -252,10 +253,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -252,10 +253,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
#endif #endif
} }
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place));
}
driver_version_ = GetCUDADriverVersion(place_.device); driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device); runtime_version_ = GetCUDARuntimeVersion(place_.device);
...@@ -348,12 +345,21 @@ bool CUDADeviceContext::tensor_core_available() const { ...@@ -348,12 +345,21 @@ bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr; return cublas_tensor_core_handle_ != nullptr;
} }
CudnnHolder* CUDADeviceContext::cudnn_holder() const {
std::call_once(init_cudnn_, [&]() {
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place_));
}
});
return cudnn_holder_.get();
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return cudnn_holder_->cudnn_handle(); return cudnn_holder()->cudnn_handle();
} }
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(cudnn_holder_.get()); return CudnnWorkspaceHandle(cudnn_holder());
} }
cudaStream_t CUDADeviceContext::stream() const { return stream_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; }
......
...@@ -292,9 +292,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -292,9 +292,11 @@ class CUDADeviceContext : public DeviceContext {
private: private:
CUDAPlace place_; CUDAPlace place_;
mutable std::once_flag init_cudnn_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_; mutable std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<CublasHandleHolder> cublas_handle_; std::unique_ptr<CublasHandleHolder> cublas_handle_;
...@@ -317,6 +319,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -317,6 +319,7 @@ class CUDADeviceContext : public DeviceContext {
// StreamCallbackManager is thread-safe // StreamCallbackManager is thread-safe
std::unique_ptr<StreamCallbackManager> callback_manager_; std::unique_ptr<StreamCallbackManager> callback_manager_;
CudnnHolder* cudnn_holder() const;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
}; };
......
...@@ -242,6 +242,7 @@ PYBIND11_MODULE(core, m) { ...@@ -242,6 +242,7 @@ PYBIND11_MODULE(core, m) {
self.forward_id_ = forward_id; self.forward_id_ = forward_id;
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property_readonly("type", &imperative::OpBase::Type)
.def_property( .def_property(
"backward_id", "backward_id",
[](const imperative::OpBase &self) { return self.backward_id_; }, [](const imperative::OpBase &self) { return self.backward_id_; },
...@@ -355,7 +356,8 @@ PYBIND11_MODULE(core, m) { ...@@ -355,7 +356,8 @@ PYBIND11_MODULE(core, m) {
.def("_set_double_element", TensorSetElement<double>) .def("_set_double_element", TensorSetElement<double>)
.def("_get_double_element", TensorGetElement<double>) .def("_get_double_element", TensorGetElement<double>)
.def("_place", [](Tensor &self) { return self.place(); }) .def("_place", [](Tensor &self) { return self.place(); })
.def("_dtype", [](Tensor &self) { return self.type(); }); .def("_dtype", [](Tensor &self) { return self.type(); })
.def("__getitem__", PySliceTensor, py::return_value_policy::reference);
py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC( py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
LoDTensor is a Tensor with optional LoD information. LoDTensor is a Tensor with optional LoD information.
...@@ -507,6 +509,13 @@ PYBIND11_MODULE(core, m) { ...@@ -507,6 +509,13 @@ PYBIND11_MODULE(core, m) {
Returns: Returns:
out (bool): whether the lod is valid. out (bool): whether the lod is valid.
)DOC")
.def("__getitem__", PySliceTensor, py::return_value_policy::reference,
R"DOC(
Slice the original Tensor, and remove the LoD information.
Returns:
out (Tensor): new Tensor(NOT LoDTensor).
)DOC"); )DOC");
py::class_<SelectedRows>(m, "SelectedRows") py::class_<SelectedRows>(m, "SelectedRows")
...@@ -779,7 +788,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -779,7 +788,11 @@ All parameter, weight, gradient are variables in Paddle.
#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) #if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
CUDAPlace is a descriptor of a device. It represents a GPU, and each CUDAPlace
has a dev_id to indicate which card it refers to.
The memory of CUDAPlaces with different dev_id values is not mutually accessible.
)DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPlace &self, int dev_id) { [](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -800,7 +813,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -800,7 +813,10 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
CPUPlace is a descriptor of a device. It represents a CPU, and the memory
it describes can be accessed by the CPU.
)DOC")
.def(py::init<>()) .def(py::init<>())
.def("_type", &PlaceIndex<platform::CPUPlace>) .def("_type", &PlaceIndex<platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>) .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
...@@ -810,7 +826,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -810,7 +826,10 @@ All parameter, weight, gradient are variables in Paddle.
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace") py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace", R"DOC(
CUDAPinnedPlace is a descriptor of a device. The memory of CUDAPinnedPlace
can be accessed by both the GPU and the CPU.
)DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPinnedPlace &self) { [](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
......
...@@ -14,16 +14,22 @@ limitations under the License. */ ...@@ -14,16 +14,22 @@ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include <algorithm>
#include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
namespace details { namespace details {
...@@ -191,6 +197,253 @@ inline void PyCPUTensorSetFromArray( ...@@ -191,6 +197,253 @@ inline void PyCPUTensorSetFromArray(
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
} }
template <typename T, size_t D>
void _sliceCompute(const framework::Tensor *in, framework::Tensor *out,
const platform::CPUDeviceContext &ctx,
const std::vector<int> &axes,
const std::vector<int> &starts) {
auto &eigen_place = *ctx.eigen_device();
auto place = in->place();
auto out_dims = out->dims();
auto in_dims = in->dims();
auto offsets = Eigen::array<int, D>();
auto extents = Eigen::array<int, D>();
for (size_t i = 0; i < D; ++i) {
offsets[i] = 0;
extents[i] = out_dims[i];
}
int start;
for (size_t i = 0; i < axes.size(); ++i) {
start = starts[i];
if (start < 0) {
start = (start + in_dims[axes[i]]);
}
start = std::max(start, 0);
offsets[axes[i]] = start;
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out);
out_t.device(eigen_place) = in_t.slice(offsets, extents);
}
template <typename T>
void _concatCompute(const std::vector<paddle::framework::Tensor> &ins,
paddle::framework::Tensor *out,
const platform::CPUDeviceContext &ctx, int64_t axis) {
if (axis == 0 && ins.size() < 10) {
size_t output_offset = 0;
for (auto &in : ins) {
auto in_stride = framework::stride_numel(in.dims());
auto out_stride = framework::stride_numel(out->dims());
paddle::operators::StridedNumelCopyWithAxis<T>(
ctx, axis, out->data<T>() + output_offset, out_stride, in.data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
paddle::operators::math::ConcatFunctor<platform::CPUDeviceContext, T>
concat_functor;
concat_functor(ctx, ins, static_cast<int>(axis), out);
}
}
void _getSliceinfo(const framework::Tensor &self, py::object obj,
const int64_t dim, int64_t *pstart, int64_t *pstop,
int64_t *pstep, int64_t *pslicelength) {
auto &start = *pstart;
auto &stop = *pstop;
auto &step = *pstep;
auto &slicelength = *pslicelength;
const framework::DDim &srcDDim = self.dims();
if (dim < 0 || dim >= srcDDim.size()) {
throw py::index_error();
}
if (py::isinstance<py::slice>(obj)) {
size_t lstart, lstop, lstep, lslicelength;
py::slice s = static_cast<py::slice>(obj);
if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
throw py::index_error();
}
start = static_cast<int64_t>(lstart);
stop = static_cast<int64_t>(lstop);
step = static_cast<int64_t>(lstep);
slicelength = static_cast<int64_t>(lslicelength);
} else if (py::isinstance<py::int_>(obj)) {
start = static_cast<int64_t>(static_cast<py::int_>(obj));
if (std::abs(start) >= srcDDim[dim]) {
throw py::index_error();
}
start = (start >= 0) ? start : srcDDim[dim] - start;
stop = start + 1;
step = 1;
slicelength = 1;
} else {
throw py::index_error();
}
}
inline framework::Tensor *_getTensor(const framework::Tensor &self,
const framework::DDim &ddim) {
framework::Tensor *output = new framework::Tensor();
output->Resize(ddim);
auto place = self.place();
if (platform::is_cpu_place(place)) {
output->mutable_data(boost::get<platform::CPUPlace>(place), self.type());
#ifdef PADDLE_WITH_CUDA
} else {
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(boost::get<platform::CUDAPinnedPlace>(place),
self.type());
} else if ((platform::is_gpu_place(place))) {
output->mutable_data(boost::get<platform::CUDAPlace>(place), self.type());
}
#endif
}
return output;
}
template <typename T>
void _sliceDapper(const framework::Tensor *in, framework::Tensor *out,
const platform::CPUDeviceContext &ctx,
const std::vector<int> &axes, const std::vector<int> &starts,
int size) {
switch (size) {
case 1:
_sliceCompute<T, 1>(in, out, ctx, axes, starts);
break;
case 2:
_sliceCompute<T, 2>(in, out, ctx, axes, starts);
break;
case 3:
_sliceCompute<T, 3>(in, out, ctx, axes, starts);
break;
case 4:
_sliceCompute<T, 4>(in, out, ctx, axes, starts);
break;
case 5:
_sliceCompute<T, 5>(in, out, ctx, axes, starts);
break;
case 6:
_sliceCompute<T, 6>(in, out, ctx, axes, starts);
break;
case 7:
_sliceCompute<T, 7>(in, out, ctx, axes, starts);
break;
case 8:
_sliceCompute<T, 8>(in, out, ctx, axes, starts);
break;
case 9:
_sliceCompute<T, 9>(in, out, ctx, axes, starts);
break;
default:
PADDLE_THROW("dim size not expected, current is %d", size);
break;
}
}
template <typename T>
inline framework::Tensor *_sliceWrapper(const framework::Tensor &self,
const platform::CPUDeviceContext &ctx,
py::object obj, int dim, int64_t start,
int64_t slicelength) {
framework::DDim dstDDim = self.dims();
dstDDim[dim] = static_cast<int64_t>(slicelength);
std::vector<int> axes({dim});
std::vector<int> starts({static_cast<int>(start)});
framework::Tensor *output = _getTensor(self, dstDDim);
_sliceDapper<T>(&self, output, ctx, axes, starts, dstDDim.size());
return output;
}
template <typename T>
inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self,
py::object obj, int dim) {
platform::CPUDeviceContext ctx;
int64_t start, stop, step, slicelength;
_getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength);
if (step == 1 || slicelength == 1) {
return _sliceWrapper<T>(self, ctx, obj, dim, start, slicelength);
} else {
std::vector<framework::Tensor> ins;
for (auto i = 0; i < slicelength; ++i, start += step) {
ins.emplace_back(*_sliceWrapper<T>(self, ctx, obj, dim, start, 1));
}
// do the concat operation
framework::DDim dstDDim = self.dims();
dstDDim[dim] = static_cast<int64_t>(slicelength);
framework::Tensor *output1 = _getTensor(self, dstDDim);
_concatCompute<T>(ins, output1, ctx, dim);
return output1;
}
}
inline framework::Tensor *_sliceTensor(const framework::Tensor &self,
py::object obj, int dim) {
auto src_type = self.type();
switch (src_type) {
case framework::proto::VarType::FP16:
return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
case framework::proto::VarType::FP32:
return _sliceAndConcat<float>(self, obj, dim);
case framework::proto::VarType::FP64:
return _sliceAndConcat<double>(self, obj, dim);
case framework::proto::VarType::INT32:
return _sliceAndConcat<int>(self, obj, dim);
case framework::proto::VarType::INT64:
return _sliceAndConcat<int64_t>(self, obj, dim);
case framework::proto::VarType::BOOL:
return _sliceAndConcat<bool>(self, obj, dim);
case framework::proto::VarType::INT16:
return _sliceAndConcat<bool>(self, obj, dim);
case framework::proto::VarType::UINT8:
return _sliceAndConcat<bool>(self, obj, dim);
default:
PADDLE_THROW("Not support type %d", src_type);
}
}
inline framework::Tensor *_pySliceTensor(const framework::Tensor &self,
py::object obj) {
if (py::isinstance<py::tuple>(obj)) {
py::list l = static_cast<py::list>(obj);
std::unique_ptr<framework::Tensor> target;
framework::Tensor *src = const_cast<framework::Tensor *>(&self);
for (auto i = 0; i < static_cast<int>(l.size()); ++i) {
src = _sliceTensor(*src, l[i], i);
if (i + 1 == static_cast<int>(l.size())) {
return src;
} else {
target.reset(src);
}
}
return nullptr;
} else {
return _sliceTensor(self, obj, 0);
}
}
inline framework::Tensor *PySliceTensor(const framework::Tensor &self,
py::object obj) {
if (platform::is_gpu_place(self.place())) {
std::unique_ptr<framework::Tensor> holder;
framework::Tensor src;
framework::TensorCopySync(self, platform::CPUPlace(), &src);
framework::Tensor *output = _pySliceTensor(src, obj);
holder.reset(output);
framework::Tensor *dst = _getTensor(*output, output->dims());
framework::TensorCopySync(*output, self.place(), dst);
return dst;
} else {
return _pySliceTensor(self, obj);
}
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
WMT14 dataset. WMT14 dataset.
The original WMT14 dataset is too large, so a small set of data is The original WMT14 dataset is too large, so a small set of data is
provided. This module will download the dataset from provided. This module will download the dataset from
http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators. parse training set and test set into paddle reader creators.
""" """
......
...@@ -59,13 +59,14 @@ from .parallel_executor import * ...@@ -59,13 +59,14 @@ from .parallel_executor import *
from . import compiler from . import compiler
from .compiler import * from .compiler import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable from paddle.fluid.layers.math_op_patch import monkey_patch_variable
from . import install_check
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \ __all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + \
data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [ data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
...@@ -91,6 +92,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -91,6 +92,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
'Scope', 'Scope',
'install_check',
] ]
......
...@@ -65,7 +65,7 @@ Please note that [full ImageNet validation dataset](http://www.image-net.org/cha ...@@ -65,7 +65,7 @@ Please note that [full ImageNet validation dataset](http://www.image-net.org/cha
Notes: Notes:
* The accuracy measurement requires the model with `label`. * The accuracy measurement requires the model with `label`.
* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `providing a theoretical peak compute gain of 4x int8 OPS over fp32 OPS` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). However, the actual test results at the model level will be less than 4X, and in general the average is about 2X. In addition, the calculation library optimization of batch size 1 is not as good as the large batch size. * The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `The theoretical peak compute gains are 4x int8 OPS over fp32 OPS.` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). Therefore, op-level gain is 4X and topology-level is smaller.
## 4. How to reproduce the results ## 4. How to reproduce the results
* Small dataset (Single core) * Small dataset (Single core)
......
...@@ -14,15 +14,10 @@ ...@@ -14,15 +14,10 @@
import collections import collections
import numpy as np import numpy as np
import six
from ..... import compat as cpt from ..... import compat as cpt
from .... import core from .... import core
from .... import Executor
from ....framework import IrGraph from ....framework import IrGraph
from ....framework import IrNode from ....framework import IrNode
from ....framework import Program
from ....initializer import Constant
from ....initializer import NumpyArrayInitializer
from .... import unique_name from .... import unique_name
__all__ = [ __all__ = [
...@@ -107,7 +102,6 @@ class QuantizationTransformPass(object): ...@@ -107,7 +102,6 @@ class QuantizationTransformPass(object):
self._window_size = window_size self._window_size = window_size
self._moving_rate = moving_rate self._moving_rate = moving_rate
self._need_initialized = collections.OrderedDict()
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._conv_ops = ['conv2d', 'depthwise_conv2d']
self._quantizable_grad_ops = [ self._quantizable_grad_ops = [
...@@ -127,7 +121,8 @@ class QuantizationTransformPass(object): ...@@ -127,7 +121,8 @@ class QuantizationTransformPass(object):
""" """
assert isinstance(graph, assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.' IrGraph), 'graph must be the instance of IrGraph.'
self._need_initialized.clear() #sequential_execution = core.get_pass('sequential_execution_pass')
#sequential_execution.apply(graph.graph)
self._is_test = graph.is_test() self._is_test = graph.is_test()
# marked the variable which has been dequantized. # marked the variable which has been dequantized.
dequantized_vars = collections.OrderedDict() dequantized_vars = collections.OrderedDict()
...@@ -135,6 +130,8 @@ class QuantizationTransformPass(object): ...@@ -135,6 +130,8 @@ class QuantizationTransformPass(object):
def _transform_forward(graph, op): def _transform_forward(graph, op):
for var_node in op.inputs: for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in dequantized_vars: if var_node.name() in dequantized_vars:
dequant_var_node = dequantized_vars[var_node.name()] dequant_var_node = dequantized_vars[var_node.name()]
else: else:
...@@ -168,6 +165,8 @@ class QuantizationTransformPass(object): ...@@ -168,6 +165,8 @@ class QuantizationTransformPass(object):
def _transform_backward(graph, op): def _transform_backward(graph, op):
no_dequanted_input_vars = True no_dequanted_input_vars = True
for var_node in op.inputs: for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in dequantized_vars: if var_node.name() in dequantized_vars:
dequant_var_node = dequantized_vars[var_node.name()] dequant_var_node = dequantized_vars[var_node.name()]
graph.update_input_link(var_node, dequant_var_node, op) graph.update_input_link(var_node, dequant_var_node, op)
...@@ -188,25 +187,7 @@ class QuantizationTransformPass(object): ...@@ -188,25 +187,7 @@ class QuantizationTransformPass(object):
for op in ops: for op in ops:
if op.name() in self._quantizable_grad_ops: if op.name() in self._quantizable_grad_ops:
_transform_backward(graph, op) _transform_backward(graph, op)
graph.resolve_hazard()
if len(self._need_initialized) > 0:
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
init_program = Program()
for var_desc, initializer in six.iteritems(self._need_initialized):
var = init_program.global_block().create_var(
name=var_desc.name(),
shape=var_desc.shape(),
dtype=var_desc.dtype(),
type=var_desc.type(),
lod_level=var_desc.lod_level(),
persistable=var_desc.persistable())
initializer(var, init_program.global_block())
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
return graph return graph
def _create_global_step(self, graph): def _create_global_step(self, graph):
...@@ -222,8 +203,9 @@ class QuantizationTransformPass(object): ...@@ -222,8 +203,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=core.VarDesc.VarType.INT64) var_dtype=core.VarDesc.VarType.INT64)
self._need_initialized[global_step_in.var()] = \ self._init_var_node(
Constant(value=0, force_cpu=True) global_step_in, np.zeros(
[1], dtype='int64'))
global_step_out = graph.create_var_node_from_desc( global_step_out = graph.create_var_node_from_desc(
global_step_in.var()) global_step_in.var())
# The attribute of `op_role` is needed by ParallelExecutor. # The attribute of `op_role` is needed by ParallelExecutor.
...@@ -300,7 +282,9 @@ class QuantizationTransformPass(object): ...@@ -300,7 +282,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
inputs = {'X': var_node, 'InScale': scale_in_node} inputs = {'X': var_node, 'InScale': scale_in_node}
...@@ -313,7 +297,11 @@ class QuantizationTransformPass(object): ...@@ -313,7 +297,11 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[self._window_size], shape=[self._window_size],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scales_node.var()] = Constant(value=0) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(
scales_node, np.zeros(
[self._window_size], dtype=data_type))
inputs['Iter'] = self._global_step inputs['Iter'] = self._global_step
outputs['OutScales'] = scales_node outputs['OutScales'] = scales_node
attrs = { attrs = {
...@@ -353,7 +341,9 @@ class QuantizationTransformPass(object): ...@@ -353,7 +341,9 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[1], shape=[1],
var_dtype=var_node.dtype()) var_dtype=var_node.dtype())
self._need_initialized[scale_in_node.var()] = Constant(value=0.001) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type))
scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
ins = {'X': var_node, 'InScale': scale_in_node} ins = {'X': var_node, 'InScale': scale_in_node}
...@@ -364,13 +354,15 @@ class QuantizationTransformPass(object): ...@@ -364,13 +354,15 @@ class QuantizationTransformPass(object):
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(), var_dtype=var_node.dtype(),
shape=[1]) shape=[1])
self._need_initialized[state_in_node.var()] = Constant(value=1) data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
self._init_var_node(state_in_node, np.ones([1], dtype=data_type))
accum_in_node = graph.create_persistable_node( accum_in_node = graph.create_persistable_node(
name=unique_name.generate('accum'), name=unique_name.generate('accum'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
var_dtype=var_node.dtype(), var_dtype=var_node.dtype(),
shape=[1]) shape=[1])
self._need_initialized[accum_in_node.var()] = Constant(value=1) self._init_var_node(accum_in_node, np.ones([1], dtype=data_type))
state_out_node = graph.create_var_node_from_desc(state_in_node.var( state_out_node = graph.create_var_node_from_desc(state_in_node.var(
)) ))
accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
...@@ -490,6 +482,16 @@ class QuantizationTransformPass(object): ...@@ -490,6 +482,16 @@ class QuantizationTransformPass(object):
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
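For readers comparing this with the removed Executor-based initialization, the new helper amounts to writing the initial value straight into the scope. A minimal standalone sketch of that pattern (the variable name, value and place below are illustrative placeholders, not taken from this pass):

import numpy as np
import paddle.fluid as fluid

scope = fluid.global_scope()
place = fluid.CPUPlace()
# Create (or fetch) the variable in the scope and set its tensor directly,
# instead of building an init Program and running it with an Executor.
tensor = scope.var('hypothetical_scale_var').get_tensor()
tensor.set(np.array([0.001], dtype='float32'), place)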
def _quantized_var_name(self, var_name): def _quantized_var_name(self, var_name):
""" """
Return quantized variable name for the input `var_name`. Return quantized variable name for the input `var_name`.
...@@ -592,7 +594,8 @@ class QuantizationFreezePass(object): ...@@ -592,7 +594,8 @@ class QuantizationFreezePass(object):
self._weight_bits) self._weight_bits)
self._restore_var(input_arg_name, quantized_param_v) self._restore_var(input_arg_name, quantized_param_v)
else: else:
scale_v = graph.var_node(op_node.output('OutScale')[0]) scale_v = self._to_node(op_node.outputs,
op_node.output('OutScale')[0])
self._var_scale_map[input_arg_name] = scale_v self._var_scale_map[input_arg_name] = scale_v
ops = graph.all_op_nodes() ops = graph.all_op_nodes()
...@@ -613,32 +616,35 @@ class QuantizationFreezePass(object): ...@@ -613,32 +616,35 @@ class QuantizationFreezePass(object):
for op_node in ops: for op_node in ops:
# insert dequant_op after fc/conv, need to rename inputs of the followed ops # insert dequant_op after fc/conv, need to rename inputs of the followed ops
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() if var_node.node in self._op_output_rename_map:
if name in self._op_output_rename_map: old_in = var_node
old_in = graph.var_node(name) new_in = self._op_output_rename_map[var_node.node]
new_in = self._op_output_rename_map[name]
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
# remove the unused var node in the graph # remove the unused var node in the graph
self._remove_unused_var_nodes(graph) self._remove_unused_var_nodes(graph)
graph.resolve_hazard()
return graph return graph
def _remove_fake_quant_and_dequant_op(self, graph, op_node): def _remove_fake_quant_and_dequant_op(self, graph, op_node):
k = op_node.output('Out')[0] k = self._to_node(op_node.outputs, op_node.output('Out')[0])
v = op_node.input('X')[0] v = self._to_node(op_node.inputs, op_node.input('X')[0])
if v not in self._op_input_rename_map: if v.node not in self._op_input_rename_map:
self._op_input_rename_map[k] = v self._op_input_rename_map[k.node] = v
else: else:
self._op_input_rename_map[k] = self._op_input_rename_map[v] self._op_input_rename_map[k.node] = self._op_input_rename_map[
v.node]
graph.safe_remove_nodes(op_node) graph.safe_remove_nodes(op_node)
def _insert_post_channel_dequant_op(self, graph, op_node): def _insert_post_channel_dequant_op(self, graph, op_node):
persistable_vars = [p.name() for p in graph.all_persistable_nodes()] persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() name = var_node.name()
if name in self._op_input_rename_map: if name not in op_node.input_arg_names():
old_in = graph.var_node(name) continue
new_in = graph.var_node(self._op_input_rename_map[name]) if var_node.node in self._op_input_rename_map:
old_in = var_node
new_in = self._op_input_rename_map[var_node.node]
new_in.clear_outputs() new_in.clear_outputs()
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
original_var_name = self._original_var_name(name) original_var_name = self._original_var_name(name)
...@@ -653,28 +659,20 @@ class QuantizationFreezePass(object): ...@@ -653,28 +659,20 @@ class QuantizationFreezePass(object):
assert isinstance(scale_v, IrNode) assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name] scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1: if len(op_node.output_arg_names()) != 1:
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = op_node.outputs[0] output_var_node = self._to_node(op_node.outputs,
op_node.output_arg_names()[0])
weight_scale_node = graph.create_persistable_node( weight_scale_node = graph.create_persistable_node(
name=unique_name.generate('channel_scale'), name=unique_name.generate('channel_scale'),
var_type=core.VarDesc.VarType.LOD_TENSOR, var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[channel_scale.shape[0]], shape=[channel_scale.shape[0]],
var_dtype=output_var_node.dtype()) var_dtype=output_var_node.dtype())
init_program = Program() data_type = 'float64' if output_var_node.dtype(
weight_scale_var = init_program.global_block().create_var( ) == core.VarDesc.VarType.FP64 else 'float32'
name=weight_scale_node.name(), self._init_var_node(weight_scale_node, channel_scale.astype(data_type))
shape=weight_scale_node.shape(),
dtype=weight_scale_node.dtype(),
type=weight_scale_node.type(),
lod_level=weight_scale_node.var().lod_level(),
persistable=weight_scale_node.persistable())
initializer = NumpyArrayInitializer(value=channel_scale)
initializer(weight_scale_var, init_program.global_block())
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -695,16 +693,18 @@ class QuantizationFreezePass(object): ...@@ -695,16 +693,18 @@ class QuantizationFreezePass(object):
graph.link_to(scale_var_node, dequant_op_node) graph.link_to(scale_var_node, dequant_op_node)
graph.link_to(weight_scale_node, dequant_op_node) graph.link_to(weight_scale_node, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
self._op_output_rename_map[output_var_node.name()] = dequant_var_node self._op_output_rename_map[output_var_node.node] = dequant_var_node
return dequant_var_node return dequant_var_node
def _insert_post_dequant_op(self, graph, op_node): def _insert_post_dequant_op(self, graph, op_node):
persistable_vars = [p.name() for p in graph.all_persistable_nodes()] persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs: for var_node in op_node.inputs:
name = var_node.name() name = var_node.name()
if name in self._op_input_rename_map: if name not in op_node.input_arg_names():
old_in = graph.var_node(name) continue
new_in = graph.var_node(self._op_input_rename_map[name]) if var_node.node in self._op_input_rename_map:
old_in = var_node
new_in = self._op_input_rename_map[var_node.node]
new_in.clear_outputs() new_in.clear_outputs()
graph.update_input_link(old_in, new_in, op_node) graph.update_input_link(old_in, new_in, op_node)
original_var_name = self._original_var_name(name) original_var_name = self._original_var_name(name)
...@@ -720,11 +720,12 @@ class QuantizationFreezePass(object): ...@@ -720,11 +720,12 @@ class QuantizationFreezePass(object):
assert isinstance(scale_v, IrNode) assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name] scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1: if len(op_node.output_arg_names()) != 1:
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name())) " more than one output." % (op_node.name()))
output_var_node = op_node.outputs[0] output_var_node = self._to_node(op_node.outputs,
op_node.output_arg_names()[0])
dequant_var_node = graph.create_var_node( dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()), name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(), var_type=output_var_node.type(),
...@@ -742,9 +743,27 @@ class QuantizationFreezePass(object): ...@@ -742,9 +743,27 @@ class QuantizationFreezePass(object):
graph.link_to(output_var_node, dequant_op_node) graph.link_to(output_var_node, dequant_op_node)
graph.link_to(scale_var_node, dequant_op_node) graph.link_to(scale_var_node, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node) graph.link_to(dequant_op_node, dequant_var_node)
self._op_output_rename_map[output_var_node.name()] = dequant_var_node self._op_output_rename_map[output_var_node.node] = dequant_var_node
return dequant_var_node return dequant_var_node
def _init_var_node(self, var_node, value):
assert isinstance(
value, np.ndarray), 'The type of value should be numpy array.'
assert self._scope is not None, \
'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
assert self._place is not None, \
'The place cannot be set None when activation_quantize_type equals to range_abs_max.'
tensor = self._scope.var(var_node.name()).get_tensor()
tensor.set(value, self._place)
def _to_node(self, nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the given set."
return target_node
def _load_var(self, name): def _load_var(self, name):
return np.array(self._scope.find_var(name).get_tensor()) return np.array(self._scope.find_var(name).get_tensor())
...@@ -848,6 +867,7 @@ class ConvertToInt8Pass(object): ...@@ -848,6 +867,7 @@ class ConvertToInt8Pass(object):
# remove the unused var node in the graph # remove the unused var node in the graph
self._remove_unused_var_nodes(graph) self._remove_unused_var_nodes(graph)
graph.resolve_hazard()
return graph return graph
def _convert_to_int8(self, graph, var_node): def _convert_to_int8(self, graph, var_node):
...@@ -930,5 +950,5 @@ class TransformForMobilePass(object): ...@@ -930,5 +950,5 @@ class TransformForMobilePass(object):
for output_node in op_node.outputs: for output_node in op_node.outputs:
graph.link_to(dequant_node, output_node) graph.link_to(dequant_node, output_node)
graph.safe_remove_nodes(op_node) graph.safe_remove_nodes(op_node)
graph.resolve_hazard()
return graph return graph
...@@ -86,7 +86,11 @@ class TestGraphWrapper(unittest.TestCase): ...@@ -86,7 +86,11 @@ class TestGraphWrapper(unittest.TestCase):
def test_all_vars(self): def test_all_vars(self):
self.build_program() self.build_program()
self.assertEquals(len(self.train_graph.vars()), 90) # self.assertEquals(len(self.train_graph.vars()), 90)
# activation inplace has been disabled in python side
# which may produce more variables in the program_desc
# update 90 => 94
self.assertEquals(len(self.train_graph.vars()), 94)
def test_numel_params(self): def test_numel_params(self):
self.build_program() self.build_program()
......
...@@ -627,6 +627,183 @@ class Variable(object): ...@@ -627,6 +627,183 @@ class Variable(object):
""" """
self.error_clip = error_clip self.error_clip = error_clip
def _slice_indices(self, slice, length):
"""
Reference implementation for the slice.indices method.
"""
# Compute step and length as integers.
step = 1 if slice.step is None else slice.step
# Raise ValueError for negative length or zero step.
if length < 0:
raise ValueError("length should not be negative")
if step == 0:
raise ValueError("slice step cannot be zero")
# Find lower and upper bounds for start and stop.
lower = -1 if step < 0 else 0
upper = length - 1 if step < 0 else length
# Compute start.
if slice.start is None:
start = upper if step < 0 else lower
else:
start = slice.start
start = max(start + length, lower) if start < 0 else min(start,
upper)
# Compute stop.
if slice.stop is None:
stop = lower if step < 0 else upper
else:
stop = slice.stop
stop = max(stop + length, lower) if stop < 0 else min(stop, upper)
return start, stop, step
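Since `_slice_indices` is documented as a reference implementation of `slice.indices`, a quick way to see what it returns is to compare against the built-in for a dimension of length 5 (a standalone sketch, independent of Paddle):

# Each triple is the normalized (start, stop, step), matching slice.indices(5).
length = 5
print(slice(None, None, None).indices(length))  # (0, 5, 1)   whole dimension
print(slice(-3, None, None).indices(length))    # (2, 5, 1)   last three elements
print(slice(None, None, -1).indices(length))    # (4, -1, -1) reversed traversal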
def _detectEllipsis(self, item):
has_ellipsis = False
start = 0
end = len(self.shape)
for index, o in enumerate(item):
if o is Ellipsis:
if has_ellipsis:
raise ValueError("Index can have one ellipsis only.")
has_ellipsis = True
start = index
else:
if has_ellipsis:
end = index
return has_ellipsis, start, end
def _reconstructSliceinfo(self, item):
has_ellipsis, start, end = self._detectEllipsis(item)
if has_ellipsis:
newitem = []
for i in range(start):
newitem.append(item[i])
for i in range(start, end):
newitem.append(slice(None, None, None))
for i in range(end, len(item)):
newitem.append(item[i])
return newitem
else:
return None
def _detectContinuesSlice(self, item):
starts = []
ends = []
for index, o in enumerate(item):
if isinstance(o, int):
start = int(o)
if (start > 0 and start >= self.shape[index]) \
    or (start < 0 and (start + self.shape[index]) < 0):
raise IndexError("invalid index")
start = max(start + self.shape[index], 0) if start < 0 else min(
start, self.shape[index])
starts.append(start)
ends.append(start + 1)
elif isinstance(o, slice):
start, stop, step = self._slice_indices(o, self.shape[index])
if step == 1 or step == -1:
starts.append(start)
ends.append(stop)
else:
return False, None
else:
raise IndexError("Valid index accept int or slice or ellipsis")
return True, [starts, ends]
def _cloneVar(self, copy=False):
if not copy:
return self.block.create_var(
name=unique_name.generate(".".join(self.name)),
dtype=self.dtype,
persistable=self.persistable,
stop_gradient=self._stop_gradient, )
else:
return self
def _sliceVar(self, axes, starts, ends):
new_var = self._cloneVar()
self.block.append_op(
type="slice",
inputs={'Input': [self]},
outputs={'Out': [new_var]},
attrs={'axes': axes,
'starts': starts,
'ends': ends})
return new_var
def _concatVar(self, inputs, axis):
new_var = self._cloneVar()
self.block.append_op(
type="concat",
inputs={'X': inputs},
outputs={'Out': [new_var]},
attrs={'axis': axis, })
return new_var
def _sliceAndConcatVar(self, item, axis):
if isinstance(item, slice):
if self.shape[axis] < 0:
return self._cloneVar(True)
start, stop, step = self._slice_indices(item, self.shape[axis])
if step == 1:
return self._sliceVar([axis], [start], [stop])
else:
vars = []
if step > 0:
while start < stop:
vars.append(
self._sliceVar([axis], [start], [start + 1]))
start += step
else:
while start > stop:
vars.append(
self._sliceVar([axis], [start], [start + 1]))
start += step
return self._concatVar(vars, axis)
elif isinstance(item, int):
if self.shape[axis] < 0:
return self._cloneVar(True)
index = int(item)
if (index > 0 and index >= self.shape[axis])\
or (index < 0 and (index + self.shape[axis]) < 0):
raise IndexError("invalid index")
return self._sliceVar([axis], [index], [index + 1])
else:
raise IndexError("Valid index accept int or slice or tuple")
def __getitem__(self, item):
"""
Slice the variable.
Args:
item(int/slice/tuple) : the index.
Returns:
Sliced variable
"""
new_var = None
if isinstance(item, tuple):
if len(item) > len(self.shape):
raise IndexError("Too many indexes")
newitem = self._reconstructSliceinfo(item) or item
check, info = self._detectContinuesSlice(newitem)
if check:
starts = info[0]
ends = info[1]
axes = [i for i in range(len(starts))]
return self._sliceVar(axes, starts, ends)
else:
new_var = self
for index, o in enumerate(newitem):
new_var = new_var._sliceAndConcatVar(o, index)
else:
new_var = self._sliceAndConcatVar(item, 0)
return new_var
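A rough usage sketch of the new slicing support (assuming the usual `fluid.layers.data` entry point; the variable name and shape here are illustrative only):

import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[3, 32, 32], dtype='float32', append_batch_size=False)
first_channel = x[0]       # an int index becomes a single slice op on axis 0
crop = x[:, 8:16, 8:16]    # contiguous int/slice indices collapse into one slice op
strided = x[:, ::2]        # a non-unit step falls back to per-index slice ops plus concat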
def get_all_op_protos(): def get_all_op_protos():
""" """
...@@ -744,7 +921,7 @@ class Operator(object): ...@@ -744,7 +921,7 @@ class Operator(object):
if _in_imperative_mode(): if _in_imperative_mode():
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initilized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type) self.iop = core.OpBase(type)
# TODO(minqiyang): remove these lines after we take apart all # TODO(minqiyang): remove these lines after we take apart all
...@@ -906,7 +1083,10 @@ class Operator(object): ...@@ -906,7 +1083,10 @@ class Operator(object):
@property @property
def type(self): def type(self):
return self.desc.type() if _in_imperative_mode():
return self.iop.type
else:
return self.desc.type()
def input(self, name): def input(self, name):
""" """
...@@ -2052,6 +2232,28 @@ class IrOpNode(IrNode): ...@@ -2052,6 +2232,28 @@ class IrOpNode(IrNode):
else: else:
desc._set_attr(name, val) desc._set_attr(name, val)
def input_arg_names(self):
"""
Return input arguments' names of this op node.
Returns:
list(str): input arguments' names of this op node.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().input_arg_names()
def output_arg_names(self):
"""
Return output arguments' names of this op node.
Returns:
list(str): output arguments' names of this op node.
"""
assert self.node.op() is not None, \
"The node operator description cannot be None."
return self.node.op().output_arg_names()
@property @property
def inputs(self): def inputs(self):
""" """
...@@ -2142,31 +2344,38 @@ class IrGraph(object): ...@@ -2142,31 +2344,38 @@ class IrGraph(object):
""" """
return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()}
def var_node(self, name): def _find_var_node(self, key):
""" """
Get a variable node by name from the graph. Get a variable node by the `key` from this graph. The key
can be a node name or a node id.
Warns:
Several nodes may share the same name, so be cautious
when using this method to find the target var node by its name.
Args: Args:
name(str): the name of the variable node. key(str|int): The str type denotes that the target variable node's name.
And the int type denotes that the target variable node's id.
Raises: Raises:
ValueError: The If input's type is not str, or this graph ValueError: If this graph doesn't have a variable with the giving name or id.
doesn't have a variable with the giving name.
Returns: Returns:
IrVarNode: the variable node with the giving name. IrVarNode: the variable node with the giving name or id.
""" """
if not isinstance(name, six.string_types):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
target_var_node = None target_var_node = None
var_nodes = self.all_var_nodes() var_nodes = self.all_var_nodes()
for var_node in var_nodes: if isinstance(key, six.string_types):
if var_node.name() == name: for var_node in var_nodes:
target_var_node = var_node if var_node.name() == key:
target_var_node = var_node
elif isinstance(key, int):
for var_node in var_nodes:
if var_node.id() == key:
target_var_node = var_node
if target_var_node is None: if target_var_node is None:
raise ValueError("var_node %s not in this graph" % name) raise ValueError("var_node %s not in this graph" % key)
return target_var_node return target_var_node
def create_persistable_node(self, name, var_type, shape, var_dtype): def create_persistable_node(self, name, var_type, shape, var_dtype):
...@@ -2312,6 +2521,34 @@ class IrGraph(object): ...@@ -2312,6 +2521,34 @@ class IrGraph(object):
original_nodes = {n.node for n in remove_nodes} original_nodes = {n.node for n in remove_nodes}
core.graph_safe_remove_nodes(self.graph, original_nodes) core.graph_safe_remove_nodes(self.graph, original_nodes)
def resolve_hazard(self):
def _to_node(nodes, node_name):
target_node = None
for n in nodes:
if n.name() == node_name:
target_node = n
assert target_node is not None, "Cannot find the target node in the given set."
return target_node
ordered_nodes = core.topology_sort(self.graph)
var_nodes = dict()
for node in ordered_nodes:
if node.is_op() and node.op() is not None:
for each_var_name in node.op().input_arg_names():
if each_var_name not in var_nodes:
var_nodes[each_var_name] = [
_to_node(node.inputs, each_var_name)
]
for each_var_name in node.op().output_arg_names():
if each_var_name not in var_nodes:
var_nodes[each_var_name] = [
_to_node(node.outputs, each_var_name)
]
else:
var_nodes[each_var_name].append(
_to_node(node.outputs, each_var_name))
self.graph.resolve_hazard(var_nodes)
def has_circle(self): def has_circle(self):
""" """
Check if the graph has a circle. Check if the graph has a circle.
......
...@@ -44,7 +44,7 @@ def guard(place=None): ...@@ -44,7 +44,7 @@ def guard(place=None):
yield yield
def to_variable(value, block=None): def to_variable(value, block=None, name=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode" assert enabled(), "to_variable could only be called in imperative mode"
...@@ -53,9 +53,10 @@ def to_variable(value, block=None): ...@@ -53,9 +53,10 @@ def to_variable(value, block=None):
py_var = framework.Variable( py_var = framework.Variable(
block, block,
type=core.VarDesc.VarType.LOD_TENSOR, type=core.VarDesc.VarType.LOD_TENSOR,
name=None, name=name,
shape=value.shape, shape=value.shape,
dtype=value.dtype) dtype=value.dtype,
stop_gradient=True)
var = py_var._ivar.value() var = py_var._ivar.value()
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set(value, framework._current_expected_place()) tensor.set(value, framework._current_expected_place())
......
...@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase):
Returns dtype of the input Returns dtype of the input
""" """
inputs_in = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs_in) inputs = self._multiple_input(inputs_in)
dtype = None dtype = None
for each in inputs: for each in inputs:
...@@ -191,13 +192,7 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -191,13 +192,7 @@ class LayerObjectHelper(LayerHelperBase):
act['use_mkldnn'] = use_mkl_dnn act['use_mkldnn'] = use_mkl_dnn
act_type = act.pop('type') act_type = act.pop('type')
tmp = input_var tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
# NOTE(dzhwinter): some activation support inplace compution.
# NOTE(minqiyang): currently, we don't support inplace in imperative mode
if not _in_imperative_mode() and core.IsInplace(act_type):
tmp = input_var
else:
tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -17,10 +17,12 @@ import contextlib ...@@ -17,10 +17,12 @@ import contextlib
import sys import sys
import numpy as np import numpy as np
import collections import collections
import six
from .. import unique_name from .. import unique_name
from paddle.fluid import core from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper from .layer_object_helper import LayerObjectHelper
from paddle.fluid import framework from paddle.fluid import framework
from ..param_attr import ParamAttr
__all__ = ['Layer', 'PyLayer'] __all__ = ['Layer', 'PyLayer']
...@@ -72,6 +74,10 @@ class Layer(core.Layer): ...@@ -72,6 +74,10 @@ class Layer(core.Layer):
Returns created parameter Variable. Returns created parameter Variable.
""" """
if isinstance(attr, ParamAttr) and (attr.name is not None):
attr.name = ".".join([self._full_name, attr.name])
elif isinstance(attr, six.string_types):
attr = ".".join([self._full_name, attr])
return self._helper.create_parameter(attr, shape, dtype, is_bias, return self._helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer) default_initializer)
...@@ -164,6 +170,7 @@ class Layer(core.Layer): ...@@ -164,6 +170,7 @@ class Layer(core.Layer):
the sublayer passed in. the sublayer passed in.
""" """
assert isinstance(sublayer, core.Layer) assert isinstance(sublayer, core.Layer)
self._sub_layers[name] = sublayer self._sub_layers[name] = sublayer
return sublayer return sublayer
......
...@@ -15,15 +15,20 @@ ...@@ -15,15 +15,20 @@
from __future__ import print_function from __future__ import print_function
from six.moves import reduce from six.moves import reduce
import numpy as np
from .. import core from .. import core
from ..layers import utils from ..layers import utils
from . import layers from . import layers
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
from ..layers import layer_function_generator
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..initializer import Normal, Constant from ..initializer import Normal, Constant, NumpyArrayInitializer
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] __all__ = [
'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm',
'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv'
]
class Conv2D(layers.Layer): class Conv2D(layers.Layer):
...@@ -438,7 +443,6 @@ class Embedding(layers.Layer): ...@@ -438,7 +443,6 @@ class Embedding(layers.Layer):
self._size = size self._size = size
self._is_sparse = is_sparse self._is_sparse = is_sparse
self._is_distributed = is_distributed self._is_distributed = is_distributed
self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
size[0] + padding_idx) size[0] + padding_idx)
...@@ -471,6 +475,131 @@ class Embedding(layers.Layer): ...@@ -471,6 +475,131 @@ class Embedding(layers.Layer):
return out return out
class LayerNorm(layers.Layer):
def __init__(self,
name_scope,
scale=True,
shift=True,
begin_norm_axis=1,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None):
"""
${comment}
The formula is as follows:
.. math::
\\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
\\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
* :math:`a`: the vector representation of the summed inputs to the neurons
in that layer.
* :math:`H`: the number of hidden units in a layers
* :math:`g`: the trainable scale parameter.
* :math:`b`: the trainable bias parameter.
Args:
input(Variable): The input tensor variable.
scale(bool): Whether to learn the adaptive gain :math:`g` after
normalization. Default True.
shift(bool): Whether to learn the adaptive bias :math:`b` after
normalization. Default True.
begin_norm_axis(int): The normalization will be performed along
dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
Default 1.
epsilon(float): The small value added to the variance to prevent
division by zero. Default 1e-05.
param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default None.
act(str): Activation to be applied to the output of layer normalization.
Default None.
Returns:
${y_comment}
Examples:
>>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
>>> dtype='float32')
>>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
"""
super(LayerNorm, self).__init__(name_scope)
self._scale = scale
self._shift = shift
self._begin_norm_axis = begin_norm_axis
self._epsilon = epsilon
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
input_shape = input.shape
param_shape = [
reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
]
if self._scale:
self._scale_w = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0))
if self._shift:
assert self._bias_attr is not False
self._bias_w = self.create_parameter(
attr=self._bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True)
def forward(self, input):
inputs = dict()
inputs['X'] = input
if self._scale:
inputs['Scale'] = self._scale_w
if self._shift:
inputs['Bias'] = self._bias_w
# create output
mean_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
variance_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True)
layer_norm_out = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={
"epsilon": self._epsilon,
"begin_norm_axis": self._begin_norm_axis
})
return self._helper.append_activation(layer_norm_out)
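The docstring example above still shows the functional `fluid.layers.layer_norm`; as a rough sketch, the new imperative class would presumably be used like this (assuming `guard`, `to_variable` and the class are re-exported under `fluid.imperative` as the updated `__all__` suggests, and with a made-up input shape):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(
        np.random.random((3, 32, 32)).astype('float32'))
    layer_norm = fluid.imperative.nn.LayerNorm('layer_norm', begin_norm_axis=1)
    out = layer_norm(x)  # normalized over the trailing 32 * 32 elements of each sample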
class GRUUnit(layers.Layer): class GRUUnit(layers.Layer):
""" """
**GRU unit layer** **GRU unit layer**
...@@ -603,3 +732,668 @@ class GRUUnit(layers.Layer): ...@@ -603,3 +732,668 @@ class GRUUnit(layers.Layer):
}) })
return updated_hidden, reset_hidden_pre, gate return updated_hidden, reset_hidden_pre, gate
class NCE(layers.Layer):
"""
${comment}
Args:
input (Variable): input variable.
label (Variable): label.
num_total_classes (int):${num_total_classes_comment}
sample_weight (Variable|None): A Variable of shape [batch_size, 1]
storing a weight for each sample. The default weight for each
sample is 1.0.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of nce. If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
num_neg_samples (int): ${num_neg_samples_comment}
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
sampler (str): The sampler used to sample classes from negative classes.
It can be 'uniform', 'log_uniform' or 'custom_dist'.
default: 'uniform'.
custom_dist (float[]): A float[] with size=num_total_classes.
It is used when sampler is set to 'custom_dist'.
custom_dist[i] is the probability of the i-th class being sampled.
default: None.
seed (int): The seed used in sampler. default: 0.
is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
Returns:
Variable: The output nce loss.
Examples:
.. code-block:: python
window_size = 5
words = []
for i in xrange(window_size):
words.append(layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
dict_size = 10000
label_word = int(window_size / 2) + 1
embs = []
for i in xrange(window_size):
if i == label_word:
continue
emb = layers.embedding(input=words[i], size=[dict_size, 32],
param_attr='emb.w', is_sparse=True)
embs.append(emb)
embs = layers.concat(input=embs, axis=1)
loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=dict_size, param_attr='nce.w',
bias_attr='nce.b')
#or use custom distribution
dist = fluid.layers.assign(input=np.array([0.05,0.5,0.1,0.3,0.05]).astype("float32"))
loss = layers.nce(input=embs, label=words[label_word],
num_total_classes=5, param_attr='nce.w',
bias_attr='nce.b',
num_neg_samples=3,
sampler="custom_dist",
custom_dist=dist)
"""
def __init__(self,
name_scope,
num_total_classes,
param_attr=None,
bias_attr=None,
num_neg_samples=None,
sampler="uniform",
custom_dist=None,
seed=0,
is_sparse=False):
super(NCE, self).__init__(name_scope)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._num_total_classes = num_total_classes
self._inputs = dict()
if sampler == "uniform":
sampler = 0
elif sampler == "log_uniform":
sampler = 1
elif sampler == "custom_dist":
assert custom_dist is not None
# assert isinstance(custom_dist, Variable)
custom_dist_len = len(custom_dist)
alias_probs_ = [0] * custom_dist_len
alias_ = [0] * custom_dist_len
bigs = []
littles = []
for i in range(custom_dist_len):
normal_prob = custom_dist[i] * custom_dist_len
if normal_prob - 1.0 > 0:
bigs.append((i, normal_prob))
elif 1.0 - normal_prob > 0:
littles.append((i, normal_prob))
else:
alias_probs_[i] = normal_prob
alias_[i] = -1
while len(bigs) and len(littles):
big = bigs.pop(0)
little = littles.pop(0)
big_idx = big[0]
big_prob = big[1]
alias_probs_[little[0]] = little[1]
alias_[little[0]] = big_idx
big_left = big[1] + little[1] - 1
if big_left - 1.0 > 0:
bigs.append((big_idx, big_left))
elif 1.0 - big_left > 0:
littles.append((big_idx, big_left))
else:
alias_probs_[big_idx] = big_left
alias_[big_idx] = -1
if len(bigs):
big = bigs.pop(0)
alias_probs_[big[0]] = 1.0
alias_[big[0]] = -1
if len(littles):
little = littles.pop(0)
alias_probs_[little[0]] = 1.0
alias_[little[0]] = -1
def _init_by_numpy_array(numpy_array):
ret = self.create_parameter(
attr=ParamAttr(),
shape=numpy_array.shape,
dtype=numpy_array.dtype,
default_initializer=NumpyArrayInitializer(numpy_array))
ret.stop_gradient = True
return ret
self._inputs['CustomDistProbs'] = _init_by_numpy_array(
np.array(custom_dist).astype('float32'))
self._inputs['CustomDistAlias'] = _init_by_numpy_array(
np.array(alias_).astype('int32'))
self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
np.array(alias_probs_).astype('float32'))
sampler = 2
else:
raise Exception("Unsupported sampler type.")
if num_neg_samples is None:
num_neg_samples = 10
else:
num_neg_samples = int(num_neg_samples)
self._num_neg_samples = num_neg_samples
remote_prefetch = is_sparse
print(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
self._attrs = {
'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples,
'seed': seed,
'sampler': sampler,
'is_sparse': is_sparse,
'remote_prefetch': remote_prefetch
}
def _build_once(self, input, label, sample_weight=None):
assert isinstance(input, Variable)
assert isinstance(label, Variable)
dim = input.shape[1]
num_true_class = label.shape[1]
self._w = self.create_parameter(
attr=self._param_attr,
shape=[self._num_total_classes, dim],
is_bias=False,
dtype=input.dtype)
if self._bias_attr:
self._b = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_total_classes, 1],
is_bias=True,
dtype=input.dtype)
self._inputs['Bias'] = self._b
self._inputs['Weight'] = self._w
def forward(self, input, label, sample_weight=None):
assert isinstance(input, Variable)
assert isinstance(label, Variable)
self._inputs['Input'] = input
self._inputs['Label'] = label
self._inputs['SampleWeight'] = sample_weight if sample_weight is not None else []
cost = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
sample_logits = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
sample_labels = self._helper.create_variable_for_type_inference(
dtype=label.dtype)
self._helper.append_op(
type='nce',
inputs=self._inputs,
outputs={
'Cost': cost,
'SampleLogits': sample_logits,
'SampleLabels': sample_labels
},
attrs=self._attrs)
return cost / (self._num_neg_samples + 1)
class PRelu(layers.Layer):
"""
Equation:
.. math::
y = \max(0, x) + \\alpha * \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing. It supports all, channel
and element. all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
def __init__(self, name_scope, mode, param_attr=None):
super(PRelu, self).__init__(name_scope)
self._mode = mode
self._param_attr = param_attr
if self._mode not in ['all', 'channel', 'element']:
raise ValueError('mode should be one of all, channel, element.')
self._alpha_shape = [1]
def _build_once(self, input):
if self._mode == 'channel':
self._alpha_shape = [1, input.shape[1], 1, 1]
elif self._mode == 'element':
self._alpha_shape = input.shape
self._dtype = self._helper.input_dtype(input)
self._alpha = self.create_parameter(
attr=self._param_attr,
shape=self._alpha_shape,
dtype='float32',
is_bias=False,
default_initializer=Constant(1.0))
def forward(self, input):
out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="prelu",
inputs={"X": input,
'Alpha': self._alpha},
attrs={"mode": self._mode},
outputs={"Out": out})
return out
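Likewise, the docstring example above uses the functional `fluid.layers.prelu`; a hedged sketch of the imperative class (the class path and tensor shape are illustrative assumptions):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(
        np.random.random((2, 4, 8, 8)).astype('float32'))
    prelu = fluid.imperative.nn.PRelu('prelu', mode='channel')
    y = prelu(x)  # one learnable alpha per channel, i.e. alpha shape [1, 4, 1, 1]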
class BilinearTensorProduct(layers.Layer):
"""
**Add Bilinear Tensor Product Layer**
This layer performs bilinear tensor product on two inputs.
For example:
.. math::
out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1
In this formula:
- :math:`x`: the first input contains M elements, shape is [batch_size, M].
- :math:`y`: the second input contains N elements, shape is [batch_size, N].
- :math:`W_{i}`: the i-th learned weight, shape is [M, N]
- :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
- :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
Args:
x (Variable): 2-D input tensor with shape [batch_size, M]
y (Variable): 2-D input tensor with shape [batch_size, N]
size (int): The dimension of this layer.
act (str, default None): Activation to be applied to the output of this layer.
name (str, default None): The name of this layer.
param_attr (ParamAttr, default None): The parameter attribute for the learnable w.
parameters/weights of this layer.
bias_attr (ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized zero. Default: None.
Returns:
Variable: A 2-D Tensor of shape [batch_size, size].
Examples:
.. code-block:: python
tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000)
"""
def __init__(self,
name_scope,
size,
name=None,
act=None,
param_attr=None,
bias_attr=None):
super(BilinearTensorProduct, self).__init__(name_scope)
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
self._size = size
self._name = name
self._inputs = dict()
def _build_once(self, x, y):
self._dtype = self._helper.input_dtype(x)
param_shape = [self._size, x.shape[1], y.shape[1]]
self._w = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=False)
if self._bias_attr:
bias_size = [1, self._size]
bias = self.create_parameter(
attr=self._bias_attr,
shape=bias_size,
dtype=self._dtype,
is_bias=True)
self._inputs["Bias"] = bias
def forward(self, x, y):
self._inputs = {"X": x, "Y": y, "Weight": self._w}
if self._name is not None:
out = self._helper.create_variable(
name=".".join([self.full_name(), self._name]),
dtype=self._dtype,
persistable=False)
else:
out = self._helper.create_variable(
dtype=self._dtype, persistable=False)
self._helper.append_op(
type="bilinear_tensor_product",
inputs=self._inputs,
outputs={"Out": out})
# add activation
return self._helper.append_activation(out)
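A corresponding sketch for the imperative BilinearTensorProduct (again, the class path and shapes are illustrative assumptions):

import numpy as np
import paddle.fluid as fluid

with fluid.imperative.guard():
    x = fluid.imperative.to_variable(np.random.random((4, 16)).astype('float32'))
    y = fluid.imperative.to_variable(np.random.random((4, 8)).astype('float32'))
    btp = fluid.imperative.nn.BilinearTensorProduct('btp', size=6)
    out = btp(x, y)  # weight shape [6, 16, 8]; output shape [4, 6]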
class Conv2DTranspose(layers.Layer):
"""
**Convolution2D transpose layer**
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCHW format. Where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Parameters(dilations, strides, paddings) are two elements. These two elements
represent height and width, respectively. For details of the convolution transpose
layer, please refer to the following explanation and references
`therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of the filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). None if use
filter_size, padding, and stride to calculate output_size.
if output_size and filter_size are specified at the same time, they
should follow the formula above.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
calculate filter_size.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
dilation(int|tuple): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: dilation = 1.
groups(int): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups = 1.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: The tensor variable storing the convolution transpose result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
"""
def __init__(self,
name_scope,
num_filters,
output_size=None,
filter_size=None,
padding=0,
stride=1,
dilation=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None):
super(Conv2DTranspose, self).__init__(name_scope)
assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
self._param_attr = param_attr
self._bias_attr = bias_attr
self._groups = groups
self._num_filters = num_filters
self._use_cudnn = use_cudnn
self._padding = padding
self._stride = stride
self._dilation = dilation
self._filter_size = filter_size
self._output_size = output_size
self._op_type = 'conv2d_transpose'
def _build_once(self, input):
input_channel = input.shape[1]
if (input_channel == self._groups and
self._num_filters == input_channel and not self._use_cudnn):
self._op_type = 'depthwise_conv2d_transpose'
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
self._padding = utils.convert_to_list(self._padding, 2, 'padding')
self._stride = utils.convert_to_list(self._stride, 2, 'stride')
self._dilation = utils.convert_to_list(self._dilation, 2, 'dilation')
if not isinstance(self._use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
if self._filter_size is None:
if self._output_size is None:
raise ValueError(
"output_size must be set when filter_size is None")
if isinstance(self._output_size, int):
self._output_size = [self._output_size, self._output_size]
h_in = input.shape[2]
w_in = input.shape[3]
filter_size_h = (self._output_size[0] -
(h_in - 1) * self._stride[0] + 2 * self._padding[0]
- 1) // self._dilation[0] + 1
filter_size_w = (self._output_size[1] -
(w_in - 1) * self._stride[1] + 2 * self._padding[1]
- 1) // self._dilation[1] + 1
self._filter_size = [filter_size_h, filter_size_w]
else:
self._filter_size = utils.convert_to_list(
self._filter_size, 2, 'conv2d_transpose.filter_size')
if self._output_size is None:
self._output_size = []
elif isinstance(self._output_size, list) or isinstance(
self._output_size, int):
self._output_size = utils.convert_to_list(self._output_size, 2,
'output_size')
else:
raise ValueError("output_size should be list or int")
self._padding = utils.convert_to_list(self._padding, 2, 'padding')
self._groups = 1 if self._groups is None else self._groups
filter_shape = [input_channel, self._num_filters // self._groups
] + self._filter_size
self._img_filter = self.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
self._helper.append_op(
type=self._op_type,
inputs={'Input': [input],
'Filter': [self._img_filter]},
outputs={'Output': pre_bias},
attrs={
'output_size': self._output_size,
'strides': self._stride,
'paddings': self._padding,
'dilations': self._dilation,
'groups': self._groups,
'use_cudnn': self._use_cudnn
})
pre_act = self._helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
out = self._helper.append_activation(pre_act)
return out
class SequenceConv(layers.Layer):
"""
This function creates the op for sequence_conv, using the inputs and
other convolutional configurations for the filters and stride as given
in the input parameters to the function.
Args:
input (Variable): ${x_comment}
num_filters (int): number of filters.
filter_size (int): the filter size (H and W).
filter_stride (int): stride of the filter.
padding (bool): if True, add paddings.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: output of sequence_conv
"""
def __init__(self,
name_scope,
num_filters,
filter_size=3,
filter_stride=1,
padding=None,
bias_attr=None,
param_attr=None,
act=None):
super(SequenceConv, self).__init__(name_scope)
self._num_filters = num_filters
self._filter_size = filter_size
self._filter_stride = filter_stride
self._padding = padding
self._bias_attr = bias_attr
self._param_attr = param_attr
def _build_once(self, input):
self._dtype = self._helper.input_dtype(input)
filter_shape = [self._filter_size * input.shape[1], self._num_filters]
self._filter_param = self.create_parameter(
attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type='sequence_conv',
inputs={
'X': [input],
'Filter': [self._filter_param],
},
outputs={"Out": pre_bias},
attrs={
'contextStride': self._filter_stride,
'contextStart': -int(self._filter_size // 2),
'contextLength': self._filter_size
})
pre_act = self._helper.append_bias_op(pre_bias)
return self._helper.append_activation(pre_act)
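# A minimal usage sketch (hypothetical names and shapes, assuming paddle.fluid
# is imported as fluid; sequence_conv expects a LoD input of shape [T, hidden],
# where T is the total number of time steps across the batch):
#
#     with fluid.imperative.guard():
#         seq_conv = SequenceConv("seq_conv", num_filters=2, filter_size=3)
#         out = seq_conv(some_lod_input)  # -> shape [T, 2] before activation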
@@ -62,7 +62,7 @@ class Tracer(core.Tracer):
         if len(backward_refs) > 0:
             op.iop.register_backward_hooks(release_op)
-            # TODO(minqiyang): remove all inputs and outputs after seperate
+            # TODO(minqiyang): remove all inputs and outputs after separate
             # var and grad
             op.backward_refs = defaultdict(list)
             for k, v in six.iteritems(op.inputs):
......
@@ -212,7 +212,7 @@ class UniformInitializer(Initializer):
         if self._seed == 0:
             self._seed = block.program.random_seed
-        # to be compatible of fp16 initalizers
+        # to be compatible of fp16 initializers
         if var.dtype == VarDesc.VarType.FP16:
             out_dtype = VarDesc.VarType.FP32
             out_var = block.create_var(
@@ -756,7 +756,7 @@ class NumpyArrayInitializer(Initializer):
             values = [int(v) for v in self._value.flat]
         else:
             raise ValueError("Unsupported dtype %s", self._value.dtype)
-        if self._value.size > 1024 * 1024 * 5:
+        if self._value.size > 1024 * 1024 * 1024:
             raise ValueError("The size of input is too big. Please consider "
                              "saving it to file and 'load_op' to load it")
         op = block._prepend_op(
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .framework import Program, program_guard, unique_name, default_startup_program
from .param_attr import ParamAttr
from .initializer import Constant
from . import layers
from . import backward
from .imperative import Layer, nn
from . import executor
from . import core
import numpy as np
__all__ = ['run_check']
class SimpleLayer(Layer):
def __init__(self, name_scope):
super(SimpleLayer, self).__init__(name_scope)
self._fc1 = nn.FC(self.full_name(),
3,
ParamAttr(initializer=Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = layers.reduce_sum(x)
return x
def run_check():
''' Install check: verify whether the Paddle Fluid installation succeeded.
Call this function when you need to verify the installation.
'''
print("Running Verify Fluid Program ... ")
prog = Program()
startup_prog = Program()
scope = core.Scope()
with executor.scope_guard(scope):
with program_guard(prog, startup_prog):
with unique_name.guard():
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
inp = layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
simple_layer = SimpleLayer("simple_layer")
out = simple_layer(inp)
param_grads = backward.append_backward(
out, parameter_list=[simple_layer._fc1._w.name])[0]
exe = executor.Executor(core.CPUPlace(
) if not core.is_compiled_with_cuda() else core.CUDAPlace(0))
exe.run(default_startup_program())
exe.run(feed={inp.name: np_inp},
fetch_list=[out.name, param_grads[1].name])
print(
"Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
)
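# Typical usage (this is how the accompanying unit test invokes it, assuming
# this module is exposed as fluid.install_check):
#
#     import paddle.fluid as fluid
#     fluid.install_check.run_check()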
@@ -151,13 +151,7 @@ class LayerHelper(LayerHelperBase):
             act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
-        tmp = input_var
-        # NOTE(dzhwinter): some activation support inplace compution.
-        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
-        if not _in_imperative_mode() and core.IsInplace(act_type):
-            tmp = input_var
-        else:
-            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
+        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
......
@@ -268,11 +268,9 @@ class LayerHelperBase(object):
         """
         # Deepcopy the attr so that parameters can be shared in program
         attr = copy.deepcopy(attr)
-        if attr is None:
-            attr = ParamAttr._to_attr(attr)
+        attr = ParamAttr._to_attr(attr)
         if not attr:
             return None
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
         if attr.name is None:
......
@@ -6206,7 +6206,8 @@ def one_hot(input, depth):
         type="one_hot",
         inputs={'X': input},
         attrs={'depth': depth},
-        outputs={'Out': one_hot_out})
+        outputs={'Out': one_hot_out},
+        stop_gradient=True)
     return one_hot_out
......
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function

+from six.moves import reduce

 from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
......
@@ -165,6 +165,8 @@ class Optimizer(object):
             name = self._name + "_" + name
         if (name in self._accumulators and
                 param.name in self._accumulators[name]):
+            if framework._in_imperative_mode():
+                return self._accumulators[name][param.name]
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
         if shape == None:
@@ -397,13 +399,14 @@ class Optimizer(object):
             for param in parameters:
                 if not param.trainable:
                     continue
-                # create gradient variable
-                grad_var = Variable(
-                    block=loss.block,
-                    name=param._ivar._grad_name(),
-                    stop_gradient=True,
-                    ivar=param._ivar._grad_ivar())
-                params_grads.append((param, grad_var))
+                if param._ivar._grad_ivar() is not None:
+                    # create gradient variable
+                    grad_var = Variable(
+                        block=loss.block,
+                        name=param._ivar._grad_name(),
+                        stop_gradient=True,
+                        ivar=param._ivar._grad_ivar())
+                    params_grads.append((param, grad_var))
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
                 optimize_ops = self._create_optimization_pass(params_grads)
......
@@ -68,9 +68,9 @@ class TestDistSaveLoadDense2x2(TestDistBase):
             train0_np = np.array(tr0_var)
             train1_np = np.array(tr1_var)

-            self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
-            self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
-            self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
+            np.testing.assert_almost_equal(local_np, train0_np, decimal=2)
+            np.testing.assert_almost_equal(local_np, train1_np, decimal=2)
+            np.testing.assert_almost_equal(train0_np, train1_np, decimal=2)

     def test_dist(self):
         need_envs = {
@@ -134,10 +134,8 @@ class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
             train0_2_np = np.array(tr0_var_2)
             train1_2_np = np.array(tr1_var_2)

-            self.assertAlmostEqual(
-                train0_1_np.all(), train0_2_np.all(), delta=delta)
-            self.assertAlmostEqual(
-                train1_1_np.all(), train1_2_np.all(), delta=delta)
+            np.testing.assert_almost_equal(train0_1_np, train0_2_np, decimal=2)
+            np.testing.assert_almost_equal(train1_1_np, train1_2_np, decimal=2)

     def test_dist(self):
         need_envs = {
......
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import random
+import os
 import sys
 import paddle
@@ -23,16 +24,17 @@ import paddle.fluid.core as core
 from test_imperative_base import new_program_scope
 from paddle.fluid.imperative.base import to_variable

-NUM_USERS = 100
-NUM_ITEMS = 1000
-BATCH_SIZE = 32
-NUM_BATCHES = 2
+# Can use Amusic dataset as the DeepCF describes.
+DATA_PATH = os.environ.get('DATA_PATH', '')
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 128))
+NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
+NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))


-class MLP(fluid.imperative.Layer):
+class DMF(fluid.imperative.Layer):
     def __init__(self, name_scope):
-        super(MLP, self).__init__(name_scope)
+        super(DMF, self).__init__(name_scope)
         self._user_latent = fluid.imperative.FC(self.full_name(), 256)
         self._item_latent = fluid.imperative.FC(self.full_name(), 256)
@@ -61,9 +63,9 @@ class MLP(fluid.imperative.Layer):
         return fluid.layers.elementwise_mul(users, items)


-class DMF(fluid.imperative.Layer):
+class MLP(fluid.imperative.Layer):
     def __init__(self, name_scope):
-        super(DMF, self).__init__(name_scope)
+        super(MLP, self).__init__(name_scope)
         self._user_latent = fluid.imperative.FC(self.full_name(), 256)
         self._item_latent = fluid.imperative.FC(self.full_name(), 256)
         self._match_layers = []
@@ -87,21 +89,30 @@ class DMF(fluid.imperative.Layer):
 class DeepCF(fluid.imperative.Layer):
-    def __init__(self, name_scope):
+    def __init__(self, name_scope, num_users, num_items, matrix):
         super(DeepCF, self).__init__(name_scope)
-        self._user_emb = fluid.imperative.Embedding(self.full_name(),
-                                                    [NUM_USERS, 256])
-        self._item_emb = fluid.imperative.Embedding(self.full_name(),
-                                                    [NUM_ITEMS, 256])
+        self._num_users = num_users
+        self._num_items = num_items
+        self._rating_matrix = self.create_parameter(
+            fluid.ParamAttr(trainable=False),
+            matrix.shape,
+            matrix.dtype,
+            is_bias=False,
+            default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
+        self._rating_matrix._stop_gradient = True

         self._mlp = MLP(self.full_name())
         self._dmf = DMF(self.full_name())
         self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')

     def forward(self, users, items):
-        users_emb = self._user_emb(users)
-        items_emb = self._item_emb(items)
+        # users_emb = self._user_emb(users)
+        # items_emb = self._item_emb(items)
+        users_emb = fluid.layers.gather(self._rating_matrix, users)
+        items_emb = fluid.layers.gather(
+            fluid.layers.transpose(self._rating_matrix, [1, 0]), items)
+        users_emb.stop_gradient = True
+        items_emb.stop_gradient = True

         mlp_predictive = self._mlp(users_emb, items_emb)
         dmf_predictive = self._dmf(users_emb, items_emb)
@@ -116,27 +127,79 @@ def get_data():
     user_ids = []
     item_ids = []
     labels = []
+    NUM_USERS = 100
+    NUM_ITEMS = 1000
+    matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32)
     for uid in range(NUM_USERS):
         for iid in range(NUM_ITEMS):
-            # 10% positive
-            label = float(random.randint(1, 10) == 1)
+            label = float(random.randint(1, 6) == 1)
             user_ids.append(uid)
             item_ids.append(iid)
             labels.append(label)
-    indices = np.arange(NUM_USERS * NUM_ITEMS)
+            matrix[uid, iid] = label
+    indices = np.arange(len(user_ids))
+    np.random.shuffle(indices)
+    users_np = np.array(user_ids, dtype=np.int32)[indices]
+    items_np = np.array(item_ids, dtype=np.int32)[indices]
+    labels_np = np.array(labels, dtype=np.float32)[indices]
+    return np.expand_dims(users_np, -1), \
+        np.expand_dims(items_np, -1), \
+        np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix


+def load_data(DATA_PATH):
+    sys.stderr.write('loading from %s\n' % DATA_PATH)
+    likes = dict()
+    num_users = -1
+    num_items = -1
+    with open(DATA_PATH, 'r') as f:
+        for l in f.readlines():
+            uid, iid, rating = [int(v) for v in l.split('\t')]
+            num_users = max(num_users, uid + 1)
+            num_items = max(num_items, iid + 1)
+            if float(rating) > 0.0:
+                likes[(uid, iid)] = 1.0
+    user_ids = []
+    item_ids = []
+    labels = []
+    matrix = np.zeros([num_users, num_items], dtype=np.float32)
+    for uid, iid in likes.keys():
+        user_ids.append(uid)
+        item_ids.append(iid)
+        labels.append(1.0)
+        matrix[uid, iid] = 1.0
+        negative = 0
+        while negative < 3:
+            nuid = random.randint(0, num_users - 1)
+            niid = random.randint(0, num_items - 1)
+            if (nuid, niid) not in likes:
+                negative += 1
+                user_ids.append(nuid)
+                item_ids.append(niid)
+                labels.append(0.0)
+    indices = np.arange(len(user_ids))
     np.random.shuffle(indices)
-    users_np = np.array(user_ids, dtype=np.int64)[indices]
-    items_np = np.array(item_ids, dtype=np.int64)[indices]
+    users_np = np.array(user_ids, dtype=np.int32)[indices]
+    items_np = np.array(item_ids, dtype=np.int32)[indices]
     labels_np = np.array(labels, dtype=np.float32)[indices]
     return np.expand_dims(users_np, -1), \
         np.expand_dims(items_np, -1), \
-        np.expand_dims(labels_np, -1)
+        np.expand_dims(labels_np, -1), num_users, num_items, matrix


 class TestImperativeDeepCF(unittest.TestCase):
-    def test_gan_float32(self):
+    def test_deefcf(self):
         seed = 90
-        users_np, items_np, labels_np = get_data()
+        if DATA_PATH:
+            (users_np, items_np, labels_np, num_users, num_items,
+             matrix) = load_data(DATA_PATH)
+        else:
+            (users_np, items_np, labels_np, num_users, num_items,
+             matrix) = get_data()

         startup = fluid.Program()
         startup.random_seed = seed
@@ -145,11 +208,11 @@ class TestImperativeDeepCF(unittest.TestCase):
         scope = fluid.core.Scope()
         with new_program_scope(main=main, startup=startup, scope=scope):
-            users = fluid.layers.data('users', [1], dtype='int64')
-            items = fluid.layers.data('items', [1], dtype='int64')
+            users = fluid.layers.data('users', [1], dtype='int32')
+            items = fluid.layers.data('items', [1], dtype='int32')
             labels = fluid.layers.data('labels', [1], dtype='float32')
-            deepcf = DeepCF('deepcf')
+            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
             prediction = deepcf(users, items)
             loss = fluid.layers.reduce_sum(
                 fluid.layers.log_loss(prediction, labels))
@@ -159,35 +222,44 @@ class TestImperativeDeepCF(unittest.TestCase):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             exe.run(startup)
-            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                static_loss = exe.run(
-                    main,
-                    feed={
-                        users.name: users_np[slice:slice + BATCH_SIZE],
-                        items.name: items_np[slice:slice + BATCH_SIZE],
-                        labels.name: labels_np[slice:slice + BATCH_SIZE]
-                    },
-                    fetch_list=[loss])[0]
-                sys.stderr.write('static loss %s\n' % static_loss)
+            for e in range(NUM_EPOCHES):
+                sys.stderr.write('epoch %d\n' % e)
+                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                    if slice + BATCH_SIZE >= users_np.shape[0]:
+                        break
+                    static_loss = exe.run(
+                        main,
+                        feed={
+                            users.name: users_np[slice:slice + BATCH_SIZE],
+                            items.name: items_np[slice:slice + BATCH_SIZE],
+                            labels.name: labels_np[slice:slice + BATCH_SIZE]
+                        },
+                        fetch_list=[loss])[0]
+                    sys.stderr.write('static loss %s\n' % static_loss)

         with fluid.imperative.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            deepcf = DeepCF('deepcf')
-            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                prediction = deepcf(
-                    to_variable(users_np[slice:slice + BATCH_SIZE]),
-                    to_variable(items_np[slice:slice + BATCH_SIZE]))
-                loss = fluid.layers.reduce_sum(
-                    fluid.layers.log_loss(prediction,
-                                          to_variable(labels_np[slice:slice +
-                                                                BATCH_SIZE])))
-                loss._backward()
-                adam = fluid.optimizer.AdamOptimizer(0.01)
-                adam.minimize(loss)
-                deepcf.clear_gradients()
-                dy_loss = loss._numpy()
+            deepcf = DeepCF('deepcf', num_users, num_items, matrix)
+            adam = fluid.optimizer.AdamOptimizer(0.01)
+            for e in range(NUM_EPOCHES):
+                sys.stderr.write('epoch %d\n' % e)
+                for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                    if slice + BATCH_SIZE >= users_np.shape[0]:
+                        break
+                    prediction = deepcf(
+                        to_variable(users_np[slice:slice + BATCH_SIZE]),
+                        to_variable(items_np[slice:slice + BATCH_SIZE]))
+                    loss = fluid.layers.reduce_sum(
+                        fluid.layers.log_loss(prediction,
+                                              to_variable(labels_np[
+                                                  slice:slice + BATCH_SIZE])))
+                    loss._backward()
+                    adam.minimize(loss)
+                    deepcf.clear_gradients()
+                    dy_loss = loss._numpy()
+                    sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))

         self.assertEqual(static_loss, dy_loss)
......
@@ -59,7 +59,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 dtype="float32",
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
-            self.weight_1_arr.append(weight_1)
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
             bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
@@ -67,7 +67,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 shape=[self._hidden_size * 4],
                 dtype="float32",
                 default_initializer=fluid.initializer.Constant(0.0))
-            self.bias_arr.append(bias_1)
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))

     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.cell_array = []
@@ -242,7 +242,7 @@ class TestImperativePtbRnn(unittest.TestCase):
         dy_loss = None
         last_hidden = None
         last_cell = None
-        batch_num = 50
+        batch_num = 200

         for i in range(batch_num):
             x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -264,8 +264,10 @@ class TestImperativePtbRnn(unittest.TestCase):
                     dy_param_init[param.name] = param._numpy()
             dy_loss._backward()
             sgd.minimize(dy_loss)
-            for param in ptb_model.parameters():
-                dy_param_updated[param.name] = param._numpy()
+            ptb_model.clear_gradients()
+            if i == batch_num - 1:
+                for param in ptb_model.parameters():
+                    dy_param_updated[param.name] = param._numpy()

         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -323,25 +325,28 @@ class TestImperativePtbRnn(unittest.TestCase):
                     },
                     fetch_list=fetch_list)
                 static_loss_value = out[0]
-                static_last_cell_value = out[1]
-                static_last_hidden_value = out[2]
-                for k in range(3, len(out)):
-                    static_param_updated[static_param_name_list[k - 3]] = out[k]
-
-        self.assertTrue(
-            np.allclose(static_loss_value.all(), dy_loss._numpy().all()))
-        self.assertTrue(
-            np.allclose(static_last_cell_value.all(),
-                        last_cell._numpy().all()))
-        self.assertTrue(
-            np.allclose(static_last_hidden_value.all(),
-                        last_hidden._numpy().all()))
-        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(
-                np.allclose(value.all(), dy_param_init[key].all()))
-        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(value.all(), dy_param_updated[key].all()))
+                static_last_hidden_value = out[1]
+                static_last_cell_value = out[2]
+                if i == batch_num - 1:
+                    for k in range(3, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    3]] = out[k]
+
+        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+        self.assertTrue(
+            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            # print("static_init name: {}, value {}".format(key, value))
+            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
+            self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5))
+        for key, value in six.iteritems(static_param_updated):
+            # print("static name: {}, value {}".format(key, value))
+            # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
+            self.assertTrue(
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-5))

 if __name__ == '__main__':
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copy from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied to the LearningRateScheduler
# derived learning rate the to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided when checkpoints are used, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
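# Note on the learning-rate comment above: the sync branch of the test below
# combines this static learning_rate with
# fluid.layers.learning_rate_scheduler.noam_decay, so the effective rate is
# roughly learning_rate * d_model**-0.5 * min(step**-0.5,
# step * warmup_steps**-1.5), i.e. the usual Transformer warmup schedule.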
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of heads used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = 1
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
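# A small sketch of how merge_cfg_from_list is meant to be used (the flag list
# below is hypothetical). String values are eval()'d when possible, so numbers
# become numbers while plain strings such as paths stay strings:
#
#     merge_cfg_from_list(["batch_size", "64", "model_dir", "my_models"],
#                         [TrainTaskConfig, ModelHyperParams])
#     # TrainTaskConfig.batch_size == 64, TrainTaskConfig.model_dir == "my_models"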
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
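# Quick sanity check of the table built above (a sketch, not part of the test):
# row `pos` holds [sin(pos * inv_timescales), cos(pos * inv_timescales)], so the
# result has one row per position and d_pos_vec columns.
#
#     table = position_encoding_init(ModelHyperParams.max_length,
#                                    ModelHyperParams.d_model)
#     assert table.shape == (ModelHyperParams.max_length, ModelHyperParams.d_model)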
def create_data(is_static=False):
if is_static:
return [
src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
lbl_weight_np
]
else:
enc_inputs = [
to_variable(src_word_np), to_variable(src_pos_np),
to_variable(src_slf_attn_bias_np)
]
dec_inputs = [
to_variable(trg_word_np), to_variable(trg_pos_np),
to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
]
label = to_variable(lbl_word_np)
weight = to_variable(lbl_weight_np)
return enc_inputs, dec_inputs, label, weight
def create_feed_dict_list(data, init=False):
if init:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
else:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields
feed_dict_list = dict()
for i in range(len(data_input_names)):
feed_dict_list[data_input_names[i]] = data[i]
return feed_dict_list
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = fluid.layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = 32
# The placeholder for sequence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32", 2],
# This input is used in beam-search decoder for the first gather
# (cell states update)
"init_idx": [(batch_size, ), "int32"],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"init_idx",
"trg_src_attn_bias", )
# if we use py_reader
use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 2
np.random.seed(1)
src_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size, seq_len, 1),
dtype='int64')
src_pos_np = np.random.randint(
1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
trg_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size, seq_len, 1),
dtype='int64')
trg_pos_np = np.random.randint(
1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
seq_len, seq_len).astype('float32')
lbl_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(batch_size * seq_len, 1),
dtype='int64')
lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
# np.random.seed = 1
# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# src_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# trg_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
#
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
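# With the settings used in this test, preprocess_cmd = "n" layer-normalizes the
# sub-layer input, and postprocess_cmd = "da" applies dropout to the sub-layer
# output and then adds the residual connection, i.e. roughly
#     out = prev_out + dropout(sublayer(layer_norm(prev_out))).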
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
# compute q ,k ,v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scaled dot-product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
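# The forward pass above is standard multi-head scaled dot-product attention;
# in formula form (restating the code, with d_model used as the scaling
# denominator exactly as in the matmul call above):
#     Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_model) + attn_bias) * V
# after which the heads are concatenated and projected back to d_model.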
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name is pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
# use in imperative_mode to fit different length batch
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
# TODO: change this to fit dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
class WrapEncoderLayer(Layer):
def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout)
return dec_output
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
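# Note on the loss above: with label_smooth_eps > 0 the one-hot target is
# softened to (1 - eps) * one_hot + eps / trg_vocab_size (the default behaviour
# of fluid.layers.label_smooth when no prior distribution is given), and
# softmax_with_cross_entropy is then applied with soft_label=True.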
class TestImperativeTransformer(unittest.TestCase):
def test_transformer_float32(self):
seed = 90
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
with fluid.default_main_program()._lr_schedule_guard():
learning_rate = lr_decay * TrainTaskConfig.learning_rate
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
dy_param_init = dict()
dy_param_updated = dict()
for i in range(batch_num):
enc_inputs, dec_inputs, label, weights = create_data()
dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
if i == 0:
for param in transformer.parameters():
dy_param_init[param.name] = param._numpy()
dy_avg_cost._backward()
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
if i == batch_num - 1:
for param in transformer.parameters():
dy_param_updated[param.name] = param._numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
all_inputs = make_all_inputs(data_input_names)
enc_inputs_len = len(encoder_data_input_fields)
dec_inputs_len = len(decoder_data_input_fields[:-1])
enc_inputs = all_inputs[0:enc_inputs_len]
dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
dec_inputs_len]
label = all_inputs[-2]
weights = all_inputs[-1]
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
optimizer.minimize(static_avg_cost)
for param in transformer.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_sum_cost_value = None
static_avg_cost_value = None
static_predict_value = None
static_token_num_value = None
for i in range(batch_num):
feed_dict = create_feed_dict_list(create_data(True))
fetch_list = [
static_sum_cost, static_avg_cost, static_predict,
static_token_num
]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=fetch_list)
static_sum_cost_value = out[0]
static_avg_cost_value = out[1]
static_predict_value = out[2]
static_token_num_value = out[3]
if i == batch_num - 1:
for k in range(4, len(out)):
static_param_updated[static_param_name_list[k -
4]] = out[k]
self.assertTrue(
np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
self.assertTrue(
np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
self.assertTrue(
np.allclose(
static_predict_value, dy_predict._numpy(), atol=1e-5))
self.assertTrue(
np.allclose(static_token_num_value, dy_token_num._numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.allclose(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(
np.allclose(
value, dy_param_updated[key], atol=1e-4))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
class TestInstallCheck(unittest.TestCase):
    def test_install_check(self):
        fluid.install_check.run_check()
...@@ -42,10 +42,14 @@ class LayerTest(unittest.TestCase):
    def tearDownClass(cls):
        pass

    def _get_place(self, force_to_use_cpu=False):
        # this option is for ops that only have a CPU kernel
        if force_to_use_cpu:
            return core.CPUPlace()
        else:
            if core.is_compiled_with_cuda():
                return core.CUDAPlace(0)
            return core.CPUPlace()

    @contextlib.contextmanager
    def static_graph(self):
...@@ -54,22 +58,52 @@ class LayerTest(unittest.TestCase):
        fluid.default_main_program().random_seed = self.seed
        yield

    def get_static_graph_result(self, feed, fetch_list, with_lod=False):
        exe = fluid.Executor(self._get_place())
        exe.run(fluid.default_startup_program())
        return exe.run(fluid.default_main_program(),
                       feed=feed,
                       fetch_list=fetch_list,
                       return_numpy=(not with_lod))

    @contextlib.contextmanager
    def dynamic_graph(self, force_to_use_cpu=False):
        with fluid.imperative.guard(
                self._get_place(force_to_use_cpu=force_to_use_cpu)):
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
            yield


class TestLayer(LayerTest):
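# test_layer_norm below checks that the functional layers.layer_norm, the
# nn.LayerNorm wrapper in static graph mode, and nn.LayerNorm in imperative
# mode all produce the same result for the same input.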
def test_layer_norm(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False)
ret = layers.layer_norm(t)
static_ret = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret])[0]
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False)
lm = nn.LayerNorm('layer_norm')
ret = lm(t)
static_ret2 = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret])[0]
with self.dynamic_graph():
lm = nn.LayerNorm('layer_norm')
dy_ret = lm(base.to_variable(inp))
self.assertTrue(np.allclose(static_ret, static_ret2))
self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
    def test_relu(self):
        with self.static_graph():
            t = layers.data(name='t', shape=[3, 3], dtype='float32')
...@@ -228,6 +262,304 @@ class TestLayer(LayerTest):
            self.assertTrue(np.allclose(n, min_ret._numpy()))
            self.assertTrue(np.allclose(n2, max_ret._numpy()))
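# test_sequence_conv below feeds the same LoDTensor input to layers.sequence_conv
# and to the nn.SequenceConv wrapper and checks that both static-graph results match.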
def test_sequence_conv(self):
inp_np = np.arange(12).reshape([3, 4]).astype('float32')
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
with self.static_graph():
seq = layers.data(
name='seq_in',
shape=[3, 4],
dtype='float32',
lod_level=1,
append_batch_size=False)
out = layers.sequence_conv(seq, 2)
static_rlt = self.get_static_graph_result(
feed={
"seq_in": fluid.create_lod_tensor(
data=inp_np,
recursive_seq_lens=[[1, 1, 1]],
place=place)
},
fetch_list=[out],
with_lod=True)[0]
with self.static_graph():
seq = layers.data(
name='seq_in',
shape=[3, 4],
dtype='float32',
lod_level=1,
append_batch_size=False)
seq_conv = nn.SequenceConv('seq_conv', num_filters=2)
out = seq_conv(seq)
static_rlt2 = self.get_static_graph_result(
feed={
"seq_in": fluid.create_lod_tensor(
data=inp_np,
recursive_seq_lens=[[1, 1, 1]],
place=place)
},
fetch_list=[out],
with_lod=True)[0]
self.assertTrue(
np.allclose(np.array(static_rlt), np.array(static_rlt2)))
def test_conv2d_transpose(self):
inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32')
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
out = layers.conv2d_transpose(
input=img, num_filters=10, output_size=28)
static_rlt = self.get_static_graph_result(
feed={'pixel': inp_np}, fetch_list=[out])[0]
with self.static_graph():
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
conv2d_transpose = nn.Conv2DTranspose(
'conv2d_transpose', num_filters=10, output_size=28)
out = conv2d_transpose(img)
static_rlt2 = self.get_static_graph_result(
feed={'pixel': inp_np}, fetch_list=[out])[0]
with self.dynamic_graph():
conv2d_transpose = nn.Conv2DTranspose(
'conv2d_transpose', num_filters=10, output_size=28)
dy_rlt = conv2d_transpose(base.to_variable(inp_np))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_bilinear_tensor_product(self):
inp_np_x = np.array([[1, 2, 3]]).astype('float32')
inp_np_y = np.array([[4, 5, 6]]).astype('float32')
with self.static_graph():
data_x = layers.data(
name='x',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
data_y = layers.data(
name='y',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
out = layers.bilinear_tensor_product(data_x, data_y, 6)
static_rlt = self.get_static_graph_result(
feed={'x': inp_np_x,
'y': inp_np_y}, fetch_list=[out])[0]
with self.static_graph():
data_x = layers.data(
name='x',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
data_y = layers.data(
name='y',
shape=[1, 3],
dtype="float32",
append_batch_size=False)
btp = nn.BilinearTensorProduct('btp', 6)
out = btp(data_x, data_y)
static_rlt2 = self.get_static_graph_result(
feed={'x': inp_np_x,
'y': inp_np_y}, fetch_list=[out])[0]
with self.dynamic_graph():
btp = nn.BilinearTensorProduct('btp', 6)
dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_prelu(self):
inp_np = np.ones([5, 200, 100, 100]).astype('float32')
with self.static_graph():
data_t = layers.data(
name="input",
shape=[5, 200, 100, 100],
dtype="float32",
append_batch_size=False)
mode = 'channel'
out = layers.prelu(
data_t, mode, param_attr=ParamAttr(initializer=Constant(1.0)))
static_rlt = self.get_static_graph_result(
feed={"input": inp_np}, fetch_list=[out])[0]
with self.static_graph():
data_t = layers.data(
name="input",
shape=[5, 200, 100, 100],
dtype="float32",
append_batch_size=False)
mode = 'channel'
prelu = nn.PRelu(
'prelu',
mode=mode,
param_attr=ParamAttr(initializer=Constant(1.0)))
out = prelu(data_t)
static_rlt2 = self.get_static_graph_result(
feed={"input": inp_np}, fetch_list=[out])[0]
with self.dynamic_graph():
mode = 'channel'
prelu = nn.PRelu(
'prelu',
mode=mode,
param_attr=ParamAttr(initializer=Constant(1.0)))
dy_rlt = prelu(base.to_variable(inp_np))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
def test_embeding(self):
inp_word = np.array([[[1]]]).astype('int64')
dict_size = 20
with self.static_graph():
data_t = layers.data(name='word', shape=[1], dtype='int64')
emb = layers.embedding(
input=data_t,
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
static_rlt = self.get_static_graph_result(
feed={'word': inp_word}, fetch_list=[emb])[0]
with self.static_graph():
data_t = layers.data(name='word', shape=[1], dtype='int64')
emb2 = nn.Embedding(
name_scope='embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
emb_rlt = emb2(data_t)
static_rlt2 = self.get_static_graph_result(
feed={'word': inp_word}, fetch_list=[emb_rlt])[0]
with self.dynamic_graph():
emb2 = nn.Embedding(
name_scope='embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
static_rlt3 = emb2(base.to_variable(inp_word))
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt))
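# test_nce below builds the same window-based NCE loss three ways: layers.nce in
# static graph, nn.NCE in static graph, and nn.NCE in imperative mode (forced onto
# CPU because the nce op only has a CPU kernel), and checks that all three match.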
def test_nce(self):
window_size = 5
dict_size = 20
label_word = int(window_size // 2) + 1
inp_word = np.array([[[1]], [[2]], [[3]], [[4]], [[5]]]).astype('int64')
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
seed = 1
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
embs = []
for i in range(window_size):
if i == label_word:
continue
emb = layers.embedding(
input=words[i],
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs.append(emb)
embs = layers.concat(input=embs, axis=1)
nce_loss = layers.nce(input=embs,
label=words[label_word],
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
feed_dict = dict()
for i in range(window_size):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss])[0]
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
emb = nn.Embedding(
'embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs2 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs2.append(emb_rlt)
embs2 = layers.concat(input=embs2, axis=1)
nce = nn.NCE('nce',
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
nce_loss2 = nce(embs2, words[label_word])
feed_dict = dict()
for i in range(len(words)):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt2 = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss2])[0]
with self.dynamic_graph(force_to_use_cpu=True):
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
emb = nn.Embedding(
'embedding',
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(input=embs3, axis=1)
nce = nn.NCE('nce',
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b')
nce_loss3 = nce(embs3, words[label_word])
self.assertTrue(np.allclose(static_rlt2, static_rlt))
self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt))
class TestBook(unittest.TestCase):
    def test_fit_a_line(self):
...
...@@ -205,9 +205,9 @@ class TestListenAndServOp(unittest.TestCase):
        out = nce(x_array, param_array, bias_array, sample_weight,
                  label_array, 5, 2)

        np.testing.assert_almost_equal(o_cost, out[0], decimal=6)
        np.testing.assert_almost_equal(o_logits, out[1], decimal=6)
        np.testing.assert_almost_equal(o_labels, out[2], decimal=6)

    def test_nce_op_remote(self):
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
...
...@@ -14,6 +14,7 @@
from __future__ import print_function

import paddle.fluid as fluid
import paddle.fluid.core as core
import unittest
import numpy
...@@ -183,6 +184,58 @@ class TestTensor(unittest.TestCase):
        tensor_array = numpy.array(tensor)
        self.assertEqual((0, 1), tensor_array.shape)
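    # run_sliece_tensor below verifies that slicing a fluid.Tensor with basic and
    # negative-step slices returns the same values as the equivalent numpy slices;
    # test_sliece_tensor runs it on CPU and, when available, on CUDA.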
def run_sliece_tensor(self, place):
tensor = fluid.Tensor()
shape = [3, 3, 3]
tensor._set_dims(shape)
tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
[[10, 11, 12], [13, 14, 15], [16, 17, 18]],
[[19, 20, 21], [22, 23, 24], [25, 26, 27]]])
tensor.set(tensor_array, place)
n1 = tensor[1]
t1 = tensor_array[1]
self.assertTrue((numpy.array(n1) == numpy.array(t1)).all())
n2 = tensor[1:]
t2 = tensor_array[1:]
self.assertTrue((numpy.array(n2) == numpy.array(t2)).all())
n3 = tensor[0:2:]
t3 = tensor_array[0:2:]
self.assertTrue((numpy.array(n3) == numpy.array(t3)).all())
n4 = tensor[2::-2]
t4 = tensor_array[2::-2]
self.assertTrue((numpy.array(n4) == numpy.array(t4)).all())
n5 = tensor[2::-2][0]
t5 = tensor_array[2::-2][0]
self.assertTrue((numpy.array(n5) == numpy.array(t5)).all())
n6 = tensor[2:-1:-1]
t6 = tensor_array[2:-1:-1]
self.assertTrue((numpy.array(n6) == numpy.array(t6)).all())
n7 = tensor[0:, 0:]
t7 = tensor_array[0:, 0:]
self.assertTrue((numpy.array(n7) == numpy.array(t7)).all())
n8 = tensor[0::1, 0::-1, 2:]
t8 = tensor_array[0::1, 0::-1, 2:]
self.assertTrue((numpy.array(n8) == numpy.array(t8)).all())
def test_sliece_tensor(self):
# run cpu first
place = core.CPUPlace()
self.run_sliece_tensor(place)
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.run_sliece_tensor(place)
if __name__ == '__main__':
    unittest.main()
...@@ -16,8 +16,10 @@ from __future__ import print_function
import unittest

from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
from test_imperative_base import new_program_scope


class TestVariable(unittest.TestCase):
...@@ -60,6 +62,100 @@ class TestVariable(unittest.TestCase):
            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
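    # _test_slice below checks Variable.__getitem__ in static graph mode: it first
    # verifies the shapes produced by several slice expressions on a created
    # variable, then runs sliced results through the executor and compares them
    # against the corresponding numpy slices.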
def _test_slice(self):
b = default_main_program().current_block()
w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
for i in range(3):
nw = w[i]
self.assertEqual((1, 100, 100), nw.shape)
nw = w[:]
self.assertEqual((784, 100, 100), nw.shape)
nw = w[:, :, ...]
self.assertEqual((784, 100, 100), nw.shape)
nw = w[::2, ::2, :]
self.assertEqual((392, 50, 100), nw.shape)
nw = w[::-2, ::-2, :]
self.assertEqual((392, 50, 100), nw.shape)
self.assertEqual(0, nw.lod_level)
place = fluid.CPUPlace()
main = fluid.Program()
with fluid.program_guard(main):
exe = fluid.Executor(place)
tensor_array = np.array(
[[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
[[10, 11, 12], [13, 14, 15], [16, 17, 18]],
[[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
var = fluid.layers.assign(tensor_array)
var1 = var[0, 1, 1]
var2 = var[1:]
var3 = var[0:1]
var4 = var[..., ]
var5 = var[2::-2]
var6 = var[1, 1:, 1:]
var7 = var[1, ..., 1:]
var8 = var[1, ...]
local_out = exe.run(main,
fetch_list=[
var, var1, var2, var3, var4, var5, var6,
var7, var8
])
self.assertTrue((np.array(local_out[1]) == np.array(tensor_array[
0, 1, 1])).all())
self.assertTrue((np.array(local_out[2]) == np.array(tensor_array[
1:])).all())
self.assertTrue((np.array(local_out[3]) == np.array(tensor_array[
0:1])).all())
self.assertTrue((np.array(local_out[4]) == np.array(
tensor_array[..., ])).all())
self.assertTrue((np.array(local_out[5]) == np.array(tensor_array[
2::-2])).all())
self.assertTrue((np.array(local_out[6]) == np.array(tensor_array[
1, 1:, 1:])).all())
self.assertTrue((np.array(local_out[7]) == np.array(tensor_array[
1, ..., 1:])).all())
self.assertTrue((np.array(local_out[8]) == np.array(tensor_array[
1, ...])).all())
def test_slice(self):
self._test_slice()
class TestVariableImperative(unittest.TestCase):
def _test_slice(self):
b = default_main_program().current_block()
w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
for i in range(3):
nw = w[i]
self.assertEqual([1, 100, 100], nw.shape)
nw = w[:]
self.assertEqual([784, 100, 100], nw.shape)
nw = w[:, :, :]
self.assertEqual([784, 100, 100], nw.shape)
nw = w[::2, ::2, :]
self.assertEqual([392, 50, 100], nw.shape)
nw = w[::-2, ::-2, :]
self.assertEqual([392, 50, 100], nw.shape)
nw = w[0::-2, 0::-2, :]
self.assertEqual([1, 1, 100], nw.shape)
def test_slice(self):
with fluid.imperative.guard():
self._test_slice()
if __name__ == '__main__':
    unittest.main()