提交 7c09d198 编写于 作者: L luotao1

Merge branch 'develop' into anakin_bug

......@@ -204,12 +204,11 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure
......
......@@ -4,7 +4,7 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口
......@@ -104,5 +104,5 @@ engine
------------
- `inference
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
- `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
- `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
......@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
......@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
......@@ -52,7 +55,7 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
......@@ -74,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
......@@ -156,6 +159,8 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......@@ -324,7 +329,7 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
......
......@@ -14,6 +14,7 @@
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -22,27 +23,24 @@ namespace details {
class ExceptionHolder {
public:
void Catch(const platform::EnforceNotMet& exp) {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset(new platform::EnforceNotMet(exp));
type_ = kEnforceNotMet;
}
void Catch(const platform::EOFException& exp) {
std::lock_guard<std::mutex> lock(mu_);
// EOFException will not cover up existing EnforceNotMet.
if (exception_.get() == nullptr) {
exception_.reset(new platform::EOFException(exp));
type_ = kEOF;
void Catch(std::exception_ptr eptr) {
try {
std::rethrow_exception(eptr);
} catch (platform::EOFException exp) {
Catch(exp);
} catch (platform::EnforceNotMet exp) {
Catch(exp);
} catch (...) {
LOG(FATAL) << "Unknown exception caught";
}
}
bool ExceptionCatched() const {
bool IsCaught() const {
std::lock_guard<std::mutex> lock(mu_);
return exception_.get() != nullptr;
}
void Throw() {
void ReThrow() {
std::lock_guard<std::mutex> lock(mu_);
switch (type_) {
case kNone:
......@@ -50,27 +48,41 @@ class ExceptionHolder {
case kEnforceNotMet: {
auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
throw e;
break;
}
case kEOF: {
auto e = *static_cast<platform::EOFException*>(exception_.get());
throw e;
break;
}
default:
LOG(FATAL) << "Unknown exception.";
}
exception_.reset();
type_ = kNone;
ClearImpl();
}
void Clear() {
std::lock_guard<std::mutex> lock(mu_);
ClearImpl();
}
private:
void ClearImpl() {
exception_.reset();
type_ = kNone;
}
private:
void Catch(const platform::EnforceNotMet& exp) {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset(new platform::EnforceNotMet(exp));
type_ = kEnforceNotMet;
}
void Catch(const platform::EOFException& exp) {
std::lock_guard<std::mutex> lock(mu_);
// EOFException will not cover up existing EnforceNotMet.
if (exception_.get() == nullptr) {
exception_.reset(new platform::EOFException(exp));
type_ = kEOF;
}
}
enum ExceptionType { kNone, kEnforceNotMet, kEOF };
ExceptionType type_{kNone};
......
......@@ -107,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
if (exception_holder_.ExceptionCatched()) {
if (exception_holder_.IsCaught()) {
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
exception_holder_.Throw();
exception_holder_.ReThrow();
} else {
continue;
}
......@@ -220,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp(
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EOFException ex) {
exception_holder_.Catch(ex);
} catch (platform::EnforceNotMet ex) {
exception_holder_.Catch(ex);
} catch (...) {
LOG(FATAL) << "Unknown exception catched";
exception_holder_.Catch(std::current_exception());
}
};
if (pool_) {
......
......@@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
......@@ -28,6 +28,38 @@ namespace paddle {
namespace framework {
namespace ir {
/*
* The graph is a Directed Acyclic Single Static Assignment Graph.
*
* In more detail, the following properties must hold:
*
* The graph shouldn't contain cycle. Each node is a black-box to the graph
* so the node itself could be a loop operator.
*
* Each Variable-type node has only one input (thus single static assignment).
*
* The output/input of operator is variable and the output/input of variable
* is operator.
*
* The following data harzards in Program are addressed in the Graph:
*
* Write-After-Read
* a = op1(x)
* x = op2(b)
* A control-dependency connection is created bettwen op1 and op2 such that
* op1->op2, so as to ensure correct order.
*
* Write-After-Write
* x = op1(a)
* x = op2(b)
* A control-dependency connection is created between op1 and op2 such that
* op1->op2, so as to ensure correct order.
*
* Other properties currently hold, but is not enforced yet:
*
* Variable-type node (not control dep) with the same variable name share
* the same underlying VarDesc.
*/
class Graph {
public:
explicit Graph(const ProgramDesc &program);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
nodes_.emplace_back(new PDNode(std::move(teller), name));
auto* cur = nodes_.back().get();
return cur;
}
void PDPattern::AddEdge(PDNode* a, PDNode* b) {
PADDLE_ENFORCE(a);
PADDLE_ENFORCE(b);
PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
edges_.emplace_back(a, b);
}
void GraphPatternDetecter::operator()(Graph* graph,
GraphPatternDetecter::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) return;
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
for (auto& g : subgraphs) {
handler(g, graph);
}
}
bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
for (const auto& pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
pdnodes2nodes_[pdnode.get()].insert(&node);
}
}
}
return !pdnodes2nodes_.empty();
}
struct HitGroup {
std::unordered_map<PDNode*, Node*> roles;
bool Match(Node* node, PDNode* pat) {
return !roles.count(pat) || roles.at(pat) == node;
}
void Register(Node* node, PDNode* pat) { roles[pat] = node; }
};
// Tell whether Node a links to b.
bool IsNodesLink(Node* a, Node* b) {
for (auto* node : a->outputs) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<GraphPatternDetecter::subgraph_t>
GraphPatternDetecter::DetectPatterns() {
// Init empty subgraphs.
std::vector<GraphPatternDetecter::subgraph_t> result;
std::vector<HitGroup> init_groups;
PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto* first_pnode = pattern_.edges().front().first;
if (!pdnodes2nodes_.count(first_pnode)) return result;
for (auto* node : pdnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
std::array<std::vector<HitGroup>, 2> bi_records;
bi_records[0] = std::move(init_groups);
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto& edge : pattern_.edges()) {
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto& pre_groups = bi_records[step % 2];
auto& cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
// source -> target
for (Node* source : pdnodes2nodes_[edge.first]) {
for (Node* target : pdnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto& group : pre_groups) {
HitGroup new_group = group;
if (IsNodesLink(source, target) &&
new_group.Match(source, edge.first)) {
new_group.Register(source, edge.first);
if (new_group.Match(target, edge.second)) {
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
}
for (auto& group : bi_records[step % 2]) {
GraphPatternDetecter::subgraph_t subgraph;
for (auto& role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
void GraphPatternDetecter::UniquePatterns(
std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
if (subgraphs->empty()) return;
std::vector<GraphPatternDetecter::subgraph_t> result;
std::unordered_set<size_t> set;
for (auto& g : *subgraphs) {
size_t key = 0;
for (auto& item : g) {
key ^= std::hash<void*>{}(item.first);
key ^= std::hash<void*>{}(item.second);
}
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void GraphPatternDetecter::RemoveOverlappedMatch(
std::vector<subgraph_t>* subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node*> node_set;
for (const auto& subgraph : *subgraphs) {
bool valid = true;
for (auto& item : subgraph) {
if (node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto& item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest_prod.h>
#endif
#include <numeric>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
// Some basic torminolygies:
// - PDPattern: a pattern defined as a data flow graph.
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// that meets some conditions defined in `PDNode.teller`.
// - A pattern is defined with PDNodes with edges.
// Pattern detector node. This node helps to build a pattern.
struct PDNode {
// tell whether an ir::Node* is a candidation for a PDNode.
using teller_t = std::function<bool(Node*)>;
PDNode(teller_t&& teller, const std::string& name = "")
: teller_(teller), name_(name) {
PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
}
PDNode(PDNode&& other) = default;
std::vector<PDNode*> inlinks;
std::vector<PDNode*> outlinks;
bool Tell(Node* node) const {
PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
return teller_(node);
}
const std::string& name() const { return name_; }
PDNode(const PDNode&) = delete;
PDNode& operator=(const PDNode&) = delete;
private:
teller_t teller_;
std::string name_;
};
/*
* A pattern in a graph, which defined with PDNode and edges. Most graph
* patterns can be divided into PDNodes and link relations between them.
*
* For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
* operators from the computation graph, the MUL's output should have only one
* consumer which is the ELEMENTWISE_ADD.
* This pattern can be defined as with the following pseudo codes
*
* // Create two operator PDNodes.
* MUL = PDPattern.NewNode()
* ELE = PDPattern.NewNode()
* // Create the variable PDNodes.
* MUL_out = PDPattern.NewNode()
* // Add teller to define some rules that help to filter the target Nodes.
* MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
* ELE.teller = lambda(node): \
* node->IsOp() && node->Op()->Type == "elementwise_add";
* MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
* && (ELE in node->outputs)
*
* One can add more specific tellers for PDNodes or edges, both the Operator
* and Variable Nodes can be ruled in PDNode.teller.
*
* PDPattern can record the general patterns, such as the pattern represents
* - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place.
* - Ops whose inputs and outputs share the same variables
*/
class PDPattern {
public:
using edge_t = std::pair<PDNode*, PDNode*>;
void AddEdge(PDNode* a, PDNode* b);
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
const std::vector<edge_t>& edges() const { return edges_; }
private:
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(PDPattern, AddEdge);
FRIEND_TEST(PDPattern, NewNode);
#endif
std::vector<std::unique_ptr<PDNode>> nodes_;
std::vector<edge_t> edges_;
};
/*
* GraphPatternDetecter helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PDNodes in a PDPattern,
* 2. Extend a PDNode to subgraphs by deducing the connection relation defined
* in PAPattern(the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a detector
* GraphPatternDetecter detector;
* // Define the detector's pattern, by adding PDNode and define the edges.
* auto* node0 = detector.mutable_pattern().AddNode(...)
* auto* node1 = detector.mutable_pattern().AddNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* detector.mutable_pattern().AddEdge(node0, node1);
* // Create an handler, to define the behavior of treating the filtered
* // subgraphs that comply with the patterns.
* GraphPatternDetecter::handle_t handler = some labmda
* // Execute the detector.
* detector(&graph, handler);
*/
class GraphPatternDetecter {
public:
using subgraph_t = std::unordered_map<PDNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
void operator()(Graph* graph, handle_t handler);
const PDPattern& pattern() const { return pattern_; }
PDPattern* mutable_pattern() { return &pattern_; }
private:
// Mark the nodes that fits the pattern.
bool MarkPDNodesInGraph(const ir::Graph& graph);
// Detect all the pattern and output the hit records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
#endif
private:
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
PDPattern pattern_;
std::vector<hit_rcd_t> marked_records_;
std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void BuildGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
TEST(PDPattern, NewNode) {
PDPattern x;
auto* n = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(n);
ASSERT_EQ(x.nodes_.size(), 1UL);
}
TEST(PDPattern, AddEdge) {
PDPattern x;
auto* a = x.NewNode([](Node* x) { return true; });
auto* b = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(a);
ASSERT_TRUE(b);
x.AddEdge(a, b);
ASSERT_EQ(x.nodes_.size(), 2UL);
ASSERT_EQ(x.edges_.size(), 1UL);
ASSERT_EQ(x.edges_.front().first, a);
ASSERT_EQ(x.edges_.front().second, b);
ASSERT_EQ(x.nodes().size(), 2UL);
ASSERT_EQ(x.edges().size(), 1UL);
ASSERT_EQ(x.edges().front().first, a);
ASSERT_EQ(x.edges().front().second, b);
}
TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
GraphPatternDetecter x;
// mark o2, o3, v2
// The pattern is a graph:
// o2(a node named o2) -> v2(a node named v2)
// v2 -> o3(a node named o3)
auto* o2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op2" && node->IsOp();
});
auto* o3 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op3" && node->IsOp();
});
auto* v2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "var2" && node->IsVar();
});
ASSERT_FALSE(o2->Tell(nullptr));
ASSERT_FALSE(o3->Tell(nullptr));
ASSERT_FALSE(v2->Tell(nullptr));
x.pattern_.AddEdge(o2, v2);
x.pattern_.AddEdge(v2, o3);
ASSERT_EQ(x.pattern_.edges().size(), 2UL);
ASSERT_EQ(x.pattern_.edges()[0].first, o2);
ASSERT_EQ(x.pattern_.edges()[0].second, v2);
ASSERT_EQ(x.pattern_.edges()[1].first, v2);
ASSERT_EQ(x.pattern_.edges()[1].second, o3);
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
x.MarkPDNodesInGraph(graph);
ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
auto subgraphs = x.DetectPatterns();
ASSERT_EQ(subgraphs.size(), 1UL);
}
TEST(GraphPatternDetecter, MultiSubgraph) {
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
GraphPatternDetecter x;
// The pattern is a graph:
// op -> var
auto* any_op = x.mutable_pattern()->NewNode(
[](Node* node) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
},
"OP0");
auto* any_var = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsVar(); }, "VAR");
auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1");
x.mutable_pattern()->AddEdge(any_op, any_var);
x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0;
GraphPatternDetecter::handle_t handle = [&](
const GraphPatternDetecter::subgraph_t& s, Graph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
<< s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
count++;
};
x(&graph, handle);
// 1. Detect op3 -> var4 -> op5
// 2. Detect op2 -> var2 -> op3
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
ASSERT_GE(count, 1UL);
ASSERT_LE(count, 2UL);
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
......@@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference {
block->Var(out_var_name)->SetType(default_var_type);
}
};
class DummyOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
class DummyOpVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
};
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
paddle::framework::SumOpVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker,
paddle::framework::SumOpVarTypeInference);
REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
paddle::framework::SumOpMaker);
......@@ -110,5 +126,83 @@ TEST(GraphTest, Basic) {
}
ASSERT_EQ(nodes.size(), 5);
}
TEST(GraphTest, WriteAfterRead) {
// void Test() {
ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
op = prog.MutableBlock(0)->AppendOp();
op->SetType("dummy");
op->SetInput("X", {"c"});
op->SetOutput("Out", {"a"});
op->SetAttr("op_role", 1);
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
ir::Node *control_dep1 = nullptr;
ir::Node *control_dep2 = nullptr;
for (ir::Node *n : g->Nodes()) {
if (n->Name() == "sum") {
ASSERT_EQ(n->outputs[0]->Name(), "b");
ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
control_dep1 = n->outputs[1];
ASSERT_EQ(n->outputs.size(), 2);
}
if (n->Name() == "dummy") {
ASSERT_EQ(n->inputs[0]->Name(), "c");
ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
control_dep2 = n->inputs[1];
ASSERT_EQ(n->inputs.size(), 2);
}
}
ASSERT_EQ(control_dep1, control_dep2);
}
TEST(GraphTest, WriteAfterWrite) {
// void Test() {
ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
op = prog.MutableBlock(0)->AppendOp();
op->SetType("dummy");
op->SetInput("X", {"c"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
ir::Node *control_dep1 = nullptr;
ir::Node *control_dep2 = nullptr;
for (ir::Node *n : g->Nodes()) {
if (n->Name() == "sum") {
ASSERT_EQ(n->outputs[0]->Name(), "b");
ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
ASSERT_EQ(n->outputs.size(), 2);
control_dep1 = n->outputs[1];
}
if (n->Name() == "dummy") {
ASSERT_EQ(n->inputs[0]->Name(), "c");
ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
control_dep2 = n->inputs[1];
ASSERT_EQ(n->inputs.size(), 2);
ASSERT_EQ(control_dep1, control_dep2);
}
}
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle {
namespace framework {
namespace ir {
//
// NodesDFSIterator
//
NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
for (auto *x : source) stack_.push(x);
}
NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
: stack_(std::move(other.stack_)),
visited_(std::move(other.visited_)) {}
NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
: stack_(other.stack_), visited_(other.visited_) {}
Node &NodesDFSIterator::operator*() {
PADDLE_ENFORCE(!stack_.empty());
return *stack_.top();
}
NodesDFSIterator &NodesDFSIterator::operator++() {
PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
visited_.insert(stack_.top());
auto *cur = stack_.top();
stack_.pop();
for (auto *x : cur->outputs) {
if (!visited_.count(x)) {
stack_.push(x);
}
}
return *this;
}
bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
if (stack_.empty()) return other.stack_.empty();
if ((!stack_.empty()) && (!other.stack_.empty())) {
return stack_.top() == other.stack_.top();
}
return false;
}
NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
stack_ = other.stack_;
visited_ = other.visited_;
return *this;
}
Node *NodesDFSIterator::operator->() { return stack_.top(); }
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stack>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
template <typename IteratorT>
class iterator_range {
IteratorT begin_, end_;
public:
template <typename Container>
explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
iterator_range(const IteratorT &begin, const IteratorT &end)
: begin_(begin), end_(end) {}
const IteratorT &begin() const { return begin_; }
const IteratorT &end() const { return end_; }
};
// DFS iterator on nodes.
struct NodesDFSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesDFSIterator() = default;
explicit NodesDFSIterator(const std::vector<Node *> &source);
NodesDFSIterator(NodesDFSIterator &&other) noexcept;
NodesDFSIterator(const NodesDFSIterator &other);
Node &operator*();
NodesDFSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesDFSIterator &operator=(const NodesDFSIterator &other);
bool operator==(const NodesDFSIterator &other);
bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::stack<Node *> stack_;
std::unordered_set<Node *> visited_;
};
/*
* GraphTraits contains some graph traversal algorithms.
*
* Usage:
*
*/
struct GraphTraits {
static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
auto start_points = ExtractStartPoints(g);
NodesDFSIterator x(start_points);
return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
NodesDFSIterator());
}
private:
// The nodes those have no input will be treated as start points.
static std::vector<Node *> ExtractStartPoints(const Graph &g) {
std::vector<Node *> result;
for (auto *node : g.Nodes()) {
if (node->inputs.empty()) {
result.push_back(node);
}
}
return result;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -58,6 +58,9 @@ class Node {
return op_desc_;
}
bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; }
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
}
}
int OpDesc::GetBlockAttr(const std::string &name) const {
std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
std::vector<int> ids;
for (auto n : blocks) {
ids.push_back(n->ID());
}
return ids;
}
int OpDesc::GetBlockAttrId(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDesc *>(it->second)->ID();
......
......@@ -83,7 +83,9 @@ class OpDesc {
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;
std::vector<int> GetBlocksAttrIds(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name);
......
......@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
}
}
......
......@@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) {
const DDim& Tensor::dims() const { return dims_; }
int64_t Tensor::numel() const { return product(dims_); }
} // namespace framework
} // namespace paddle
......@@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) {
}
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
int rank = src.dims().size();
PADDLE_ENFORCE_GE(
rank, 2,
"'ReshapeToMatrix()' is only used for flatten high rank "
"tensors to matrixs. Can not be used in reshaping vectors.");
if (rank == 2) {
return src;
}
Tensor res;
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
......
......@@ -20,6 +20,9 @@
DEFINE_int32(io_threadpool_size, 100,
"number of threads used for doing IO, default 100");
DEFINE_int32(dist_threadpool_size, 0,
"number of threads used for distributed executed.");
namespace paddle {
namespace framework {
......@@ -35,6 +38,10 @@ void ThreadPool::Init() {
if (threadpool_.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
}
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
namespace paddle {
namespace {
......@@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init(
std::shared_ptr<framework::Scope> parent_scope) {
VLOG(3) << "Predictor::init()";
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
......@@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init(
}
NativePaddlePredictor::~NativePaddlePredictor() {
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
}
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
......
......@@ -170,6 +170,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
......@@ -300,12 +303,6 @@ op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
# Because the fully connected layer has only one MKLDNN's operator
if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN)
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
......
......@@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, label_dims.size(),
"Input(X) and Input(Label) shall have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
if (ctx->Attrs().Get<bool>("soft_label")) {
PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
"If Attr(soft_label) == true, the 2nd dimension of "
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal.");
} else {
PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
"If Attr(softLabel) == false, the 2nd dimension of "
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
"If Attr(softLabel) == false, the last dimension of "
"Input(Label) should be 1.");
}
ctx->SetOutputDim("Y", {x_dims[0], 1});
auto y_dims = x_dims;
y_dims[rank - 1] = 1;
ctx->SetOutputDim("Y", y_dims);
ctx->ShareLoD("X", /*->*/ "Y");
}
......@@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal.");
PADDLE_ENFORCE_EQ(dy_dims[1], 1,
"The 2nd dimension of Input(Y@Grad) should be 1.");
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
"Input(Y@Grad) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(label_dims.size(), rank,
"Input(Label) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"The Input(X) and Input(Label) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(dy_dims, 0, rank - 1),
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
if (ctx->Attrs().Get<bool>("soft_label")) {
PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
"When Attr(soft_label) == true, the 2nd dimension of "
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
"When Attr(soft_label) == true, the last dimension of "
"Input(X) and Input(Label) should be equal.");
} else {
PADDLE_ENFORCE_EQ(label_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of "
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
"When Attr(soft_label) == false, the last dimension of "
"Input(Label) should be 1.");
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
......@@ -113,18 +120,20 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
" where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator.");
AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x D].");
"(Tensor, default Tensor<float>), a tensor whose last dimension "
"size is equal to the number of classes. This input is a "
"probability computed by the previous operator, which is almost "
"always the result of a softmax operator.");
AddInput(
"Label",
"(Tensor), the tensor which represents the ground truth. It has the "
"same shape with 'X' except the last dimension. When soft_label is set "
"to false, the last dimension size is 1; when soft_label is set to "
"true, the last dimension size is equal to the number of classes.");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss.");
"(Tensor, default Tensor<float>), a tensor whose shape is same "
"with 'X' except that the last dimension size is 1. It "
"represents the cross entropy loss.");
AddAttr<bool>("soft_label",
"(bool, default false), a flag indicating whether to "
"interpretate the given labels as soft labels.")
......@@ -132,6 +141,12 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
CrossEntropy Operator.
The input 'X' and 'Label' will first be logically flattened to 2-D matrixs.
The matrix's second dimension(row length) is as same as the original last
dimension, and the first dimension(column length) is the product of all other
original dimensions. Then the softmax computation will take palce on each raw
of flattened matrixs.
It supports both standard cross-entropy and soft-label cross-entropy loss
computation.
1) One-hot cross-entropy:
......
......@@ -33,8 +33,13 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
auto* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int rank = x->dims().size();
Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
math::CrossEntropyFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), y, x, labels,
ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
ctx.Attr<bool>("soft_label"));
}
};
......@@ -98,9 +103,12 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* label = ctx.Input<Tensor>("Label");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
int64_t class_num = x->dims()[1];
// Following computation only depends on the last dimension size. So it's
// unnecessary to convert tensors to 2-D views.
int rank = x->dims().size();
int64_t class_num = x->dims()[rank - 1];
if (ctx.Attr<bool>("soft_label")) {
XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
label->data<T>(),
......
......@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG(7) << "ProcSerializedField:" << meta_.varname()
<< ", type:" << meta_.type() << std::endl;
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
return false;
}
return true;
}
......@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
return true;
}
return true;
PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
return false;
}
}; // namespace distributed
......
......@@ -125,13 +125,16 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto input = ctx.Input<Tensor>("Input");
auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias");
PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4,
"Input must be with 2 or 4 dimensions, i.e. NCHW");
// TODO(intel friends): the native weight format is io,
// but the mkldnn weight format is oihw, which may need be transposed.
PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4,
"Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
bool with_bias = ctx.Attr<bool>("bias_attr");
bool with_bias = bias != nullptr;
MKLDNNMD<Tensor> md(input, w, with_bias);
std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> pd =
......@@ -154,6 +157,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dst_memory = mem.dst(output_data);
auto src_memory = mem.src(input_data);
auto weights_memory = mem.weights(w_data);
// TODO(intel friends): bias memory should also be obtain from bias->data()
auto bias_memory = mem.bias();
auto forward = with_bias ? mkldnn::inner_product_forward(
......@@ -216,7 +220,8 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
const T* out_grad_data = out_grad->data<T>();
bool with_bias = ctx.Attr<bool>("bias_attr");
auto bias = ctx.Input<Tensor>("Bias");
bool with_bias = bias != nullptr;
MKLDNNMD<Tensor> md(input, w, with_bias);
MKLDNNMemory mem(&md, mkldnn_engine);
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fc_op.h"
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace operators {
......@@ -25,16 +28,24 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"Out(Output) of Fully Connected should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"),
"W(Input) of Fully Connected should not be null.");
// NCHW
auto in_dims = ctx->GetInputDim("Input");
// IO, I=C*H*W
auto w_dims = ctx->GetInputDim("W");
std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
"The shape of Bias must be [1, dim].");
}
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
"Fully Connected input should be 2-D tensor.");
PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0],
"Fully Connected input and weigth size do not match.");
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("Input", "Out");
......@@ -42,9 +53,12 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kMKLDNN};
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
if (ctx.Attr<bool>("use_mkldnn")) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout, library);
......@@ -60,27 +74,39 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
if (ctx->HasOutput(framework::GradVarName("W"))) {
ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
"Should have bias grad");
auto bias_dims = ctx->GetInputDim("Bias");
ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims);
}
}
framework::OpKernelType FCOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kMKLDNN};
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
if (ctx.Attr<bool>("use_mkldnn")) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout, library);
}
void FCOpMaker::Make() {
AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
AddInput("W", "(Tensor), The second input tensor of fc op.");
AddInput("Input",
"(Tensor), The input tensor of fully connected operator with format "
"(NCHW). ");
AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
.AsDispensable();
AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("bias_attr", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Fully Connected Operator.
......@@ -94,9 +120,47 @@ void FCOpMaker::Make() {
)DOC");
}
template <typename T>
class FCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto input = ctx.Input<Tensor>("Input");
auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias");
auto output = ctx.Output<Tensor>("Out");
auto in_dims = input->dims();
auto w_dims = w->dims();
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
static_cast<T>(1), input_data, w_data, static_cast<T>(0),
output_data);
if (bias) {
const T* bias_data = bias->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int bs = 0; bs < in_dims[0]; bs++) {
blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
output_data + bs * w_dims[1]);
}
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
namespace ops = paddle::operators;
REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
REGISTER_OPERATOR(fc_grad, ops::FCOpGrad);
REGISTER_OP_CPU_KERNEL(fc, ops::FCOpKernel<float>, ops::FCOpKernel<double>);
......@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
optimize_prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
// Trainers will get all parameters from pserver in the
// startup program, so we will wait RequestGet first
rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter();
while (true) {
rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
std::string mode = ctx->Attrs().Get<std::string>("mode");
auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
"Size of weight Alpha must be one.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (mode == "all") {
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
"For mode 'all', size of weight Alpha must be one.");
} else if (mode == "channel") {
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1],
"For channel-wise mode, size of weight Alpha must be "
"equal to the number of channels, should be %d",
x_dim[1]);
} else if (mode == "element") {
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim),
"For element-wise mode, size of weight Alpha must be "
"equal to the number of input, should be %d",
product(x_dim));
} else {
PADDLE_THROW("Unkown mode %s", mode);
}
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "The output tensor of prelu operator.");
AddComment(R"DOC(
PRelu Operator.
The equation is:
$$
f(x) =
\begin{cases}
......@@ -54,11 +75,15 @@ f(x) =
x, \qquad \text{if} \ x >= 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
There are modes:
all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
)DOC");
AddAttr<std::string>("mode", "The mode for inputs to share weights.")
.SetDefault("all");
}
};
......@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->SetOutputDim(framework::GradVarName("Alpha"),
ctx->GetInputDim("Alpha"));
auto x_grad_name = framework::GradVarName("X");
auto alpha_grad_name = framework::GradVarName("Alpha");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
}
if (ctx->HasOutput(alpha_grad_name)) {
ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
REGISTER_OP_CUDA_KERNEL(
prelu,
paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(prelu_grad,
paddle::operators::PReluGradKernel<
paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::Transform;
template <typename T>
class PReluFunctor {
public:
explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
HOSTDEVICE T operator()(const T& x) const {
if (x > 0)
return x;
else
return x * (*alpha_);
}
private:
const T* alpha_;
};
template <typename DeviceContext, typename T>
class PReluKernel : public framework::OpKernel<T> {
public:
......@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
const T* x_ptr = x->data<T>();
T* o_ptr = out->mutable_data<T>(context.GetPlace());
auto* alpha_ptr = alpha->data<T>();
const T* alpha_ptr = alpha->data<T>();
std::string mode = context.Attr<std::string>("mode");
int numel = x->numel();
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x_ptr,
x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
}
};
template <typename T>
class PReluGradFunctor {
public:
explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
HOSTDEVICE T operator()(const T& out, const T& dout) const {
if (out > 0)
return dout;
else
return dout * (*alpha_);
auto dim = x->dims();
int index = 0;
int i = 0;
int temp = 0;
if (mode == "channel") {
for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]);
index = (i / temp) % dim[1];
o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
}
} else if (mode == "element") {
for (i = 0; i < numel; i++) {
o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
}
} else {
for (i = 0; i < numel; i++) {
o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
}
}
}
private:
const T* alpha_;
};
template <typename DeviceContext, typename T>
class PReluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
auto* out = context.Input<Tensor>("Out");
auto* alpha = context.Input<Tensor>("Alpha");
auto* alpha_ptr = alpha->data<T>();
T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
const T* alpha_ptr = alpha->data<T>();
const T* x_ptr = x->data<T>();
const T* dout_ptr = dout->data<T>();
const T* out_ptr = out->data<T>();
int numel = dx->numel();
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), out_ptr,
out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
// TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
std::string mode = context.Attr<std::string>("mode");
int numel = x->numel();
auto dim = x->dims();
int index = 0;
int i = 0;
int temp = 0;
if (dx) {
T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
if (mode == "channel") {
for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]);
index = (i / temp) % dim[1];
dx_ptr[i] =
out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
}
} else if (mode == "element") {
for (i = 0; i < numel; i++) {
dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i];
}
} else {
for (i = 0; i < numel; i++) {
dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
}
}
}
index = 0;
if (dalpha) {
T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
if (mode == "channel") {
for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]);
index = (i / temp) % dim[1];
dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
}
} else if (mode == "element") {
for (i = 0; i < numel; i++) {
dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
}
} else {
for (i = 0; i < numel; i++) {
dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
}
}
}
// TODO(Guanzhong): add GPU kernels
}
};
......
......@@ -38,7 +38,7 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Input", "(Tensor), The input tensor.");
AddOutput("Out",
"(Tensor), The shape of input tensor, the data type of the shape"
" is int64_t, will be on the same device with the input Tensor.");
" is int32_t, will be on the same device with the input Tensor.");
AddComment(R"DOC(
Shape Operator
......@@ -53,5 +53,5 @@ Get the shape of input tensor. Only support CPU input Tensor now.
namespace ops = paddle::operators;
REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
ops::ShapeKernel<float>, ops::ShapeKernel<double>);
......@@ -15,6 +15,6 @@ limitations under the License. */
#include "paddle/fluid/operators/shape_op.h"
REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
paddle::operators::ShapeKernel<int64_t>,
paddle::operators::ShapeKernel<int32_t>,
paddle::operators::ShapeKernel<float>,
paddle::operators::ShapeKernel<double>);
......@@ -27,7 +27,7 @@ class ShapeKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_t = ctx.Input<Tensor>("Input");
auto* out_t = ctx.Output<Tensor>("Out");
auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
auto in_dims = in_t->dims();
for (int i = 0; i < in_dims.size(); ++i) {
out_data[i] = in_dims[i];
......
......@@ -31,16 +31,12 @@ class SoftmaxKernel : public framework::OpKernel<T> {
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
auto dims = X->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_x;
framework::LoDTensor flattened_out;
flattened_x.ShareDataWith(*X).Resize(flattened_dims);
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
int rank = X->dims().size();
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
math::SoftmaxFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &flattened_x,
&flattened_out);
context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
}
};
......@@ -55,18 +51,14 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
auto dims = Out->dims();
auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
framework::LoDTensor flattened_out;
framework::LoDTensor flattened_d_out;
framework::LoDTensor flattened_d_x;
flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
int rank = Out->dims().size();
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1);
Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1);
math::SoftmaxGradFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &flattened_out,
&flattened_d_out, &flattened_d_x);
context.template device_context<DeviceContext>(), &Out_2d, &dOut_2d,
&dX_2d);
}
};
......
......@@ -270,12 +270,13 @@ struct EventItem {
double min_time;
double max_time;
double ave_time;
float ratio;
};
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
const size_t data_width, double total) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
......@@ -300,7 +301,8 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j];
......@@ -309,7 +311,9 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.total_time / total
<< std::endl;
}
}
std::cout << std::endl;
......@@ -359,6 +363,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
double total = 0.; // the total time
for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
......@@ -379,6 +384,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
total += event_time;
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
......@@ -387,7 +393,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
event_time, event_time, event_time,
0.};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
......@@ -431,7 +438,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
// Print report
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total);
}
void DisableProfiler(EventSortingKey sorted_key,
......
......@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized);
self.SetAttr(name, ser);
})
.def("block_attr", &pd::OpDesc::GetBlockAttr)
.def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType)
......
......@@ -122,7 +122,7 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
'cpu_deterministic'
"dist_threadpool_size", 'cpu_deterministic'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......
......@@ -344,7 +344,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
sub_block = program.block(op.block_attr_id("sub_block"))
grad_sub_block = program.create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op)
......@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block"))
sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set()
# create new gradient variables
......
......@@ -476,23 +476,25 @@ class Operator(object):
attrs=None):
self.block = block
self.desc = desc
self.attrs = attrs
if self.attrs is None:
self.attrs = dict()
# note: not add self.attrs here:
# https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
op_attrs = attrs
if op_attrs is None:
op_attrs = dict()
del attrs
op_maker = core.op_proto_and_checker_maker
if op_maker.kOpRoleAttrName() not in self.attrs:
self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
if op_maker.kOpRoleAttrName() not in op_attrs:
op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
role_var_name = op_maker.kOpRoleVarAttrName()
if len(self.block.program.
op_role_var) != 0 and role_var_name not in self.attrs:
self.attrs[role_var_name] = self.block.program.op_role_var
op_role_var) != 0 and role_var_name not in op_attrs:
op_attrs[role_var_name] = self.block.program.op_role_var
if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
del self.attrs[role_var_name]
if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del op_attrs[role_var_name]
if len(self.desc.type()) != 0:
return
......@@ -576,15 +578,14 @@ class Operator(object):
arg.op = self
self.desc.set_output(out_proto.name, out_arg_names)
if self.attrs is not None:
if not isinstance(self.attrs, dict):
if op_attrs is not None:
if not isinstance(op_attrs, dict):
raise TypeError("'attrs' should be a dict.")
for attr in proto.attrs:
attr_name = attr.name
if (attr_name not in self.attrs) or (
self.attrs[attr_name] is None):
if (attr_name not in op_attrs) or (op_attrs[attr_name] is None):
continue
attr_val = self.attrs[attr_name]
attr_val = op_attrs[attr_name]
self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs()
......@@ -732,7 +733,6 @@ class Operator(object):
Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name).
"""
self.attrs[name] = val
self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val):
......@@ -774,9 +774,9 @@ class Operator(object):
"""
return self.desc.attr(name)
def block_attr(self, name):
def block_attr_id(self, name):
"""
Get the block attribute by name.
Get the block attribute's id by name.
Args:
name(str): the attribute name.
......@@ -784,22 +784,74 @@ class Operator(object):
Returns:
int: the block index.
"""
return self.desc.block_attr(name)
return self.desc.block_attr_id(name)
def block_attr(self, name):
"""
Get the block attribute by name.
Args:
name(str): the attribute name.
Returns:
block: the block attribute.
"""
id = self.block_attr_id(name)
assert (id >= 0 and id < len(self.block.program.blocks))
return self.block.program.blocks[id]
def blocks_attr(self, name):
"""
Get the blocks attribute by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks attribute.
"""
attrs = []
for i in self.blocks_attr_ids(name):
assert (i >= 0 and i < len(self.block.program.blocks))
attrs.append(self.block.program.blocks[i])
return attrs
def blocks_attr_ids(self, name):
"""
Get the blocks attribute's ids by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks ids.
"""
return self.desc.blocks_attr_ids(name)
def all_attrs(self):
"""
Get the attribute dict.
Returns:
dict: The Operator's attribute dict.
dict: The Operator's attribute dict, name->attr.
"""
attr_names = self.attr_names
attr_map = {}
for n in attr_names:
if n == 'sub_block':
attr_type = self.desc.attr_type(n)
if attr_type == core.AttrType.BLOCK:
attr_map[n] = self.block_attr(n)
else:
attr_map[n] = self.attr(n)
continue
if attr_type == core.AttrType.BLOCKS:
attr_map[n] = self.blocks_attr(n)
continue
attr_map[n] = self.attr(n)
return attr_map
......@@ -1518,11 +1570,17 @@ class Program(object):
The two code snippets above will generate same programs.
"""
if for_test:
p = self.inference_optimize()
p = self.inference_optimize(export_for_deployment=False)
else:
p = Program()
p.current_block_idx = self.current_block_idx
p._seed = self._seed
p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p._current_role = self._current_role
p._op_role_var = self._op_role_var
p._sync_with_cpp()
p._copy_param_info_from(self)
......@@ -1578,7 +1636,7 @@ class Program(object):
res._sync_with_cpp()
return res
def inference_optimize(self):
def inference_optimize(self, export_for_deployment=True):
"""
This method will create a new program and do following adjustments on it:
1. Remove all reader variables and their creator ops if exist.
......@@ -1589,6 +1647,10 @@ class Program(object):
attribute of operators to :code:`True`. All the :code:`Parameter`
information will be lost.
Args:
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference library
Notes: This API is a very low level API. Use
:code:`Program.clone(for_test=True)` instead.
......@@ -1603,16 +1665,17 @@ class Program(object):
# remove all readers and the read_op if exist
read_op_idx = 0
root_block = res.desc.block(0)
while True:
if read_op_idx >= root_block.op_size() or root_block.op(
read_op_idx).type() == 'read':
break
read_op_idx += 1
if read_op_idx < root_block.op_size():
root_block._remove_op(0, read_op_idx + 1)
for var in root_block.all_vars():
if var.type() == core.VarDesc.VarType.READER:
root_block._remove_var(var.name())
if export_for_deployment:
while True:
if read_op_idx >= root_block.op_size() or root_block.op(
read_op_idx).type() == 'read':
break
read_op_idx += 1
if read_op_idx < root_block.op_size():
root_block._remove_op(0, read_op_idx + 1)
for var in root_block.all_vars():
if var.type() == core.VarDesc.VarType.READER:
root_block._remove_var(var.name())
# change all `is_test` attributes to True
for i in range(res.desc.num_blocks()):
......
......@@ -15,7 +15,6 @@
from . import framework
import numpy as np
import contextlib
from .framework import convert_np_dtype_to_dtype_
from .core import VarDesc
__all__ = [
......@@ -264,7 +263,8 @@ class NormalInitializer(Initializer):
"dtype": int(var.dtype),
"mean": self._mean,
"std": self._std_dev,
"seed": self._seed
"seed": self._seed,
"use_mkldnn": False
})
var.op = op
return op
......
......@@ -555,7 +555,8 @@ def save_inference_model(dirname,
executor,
main_program=None,
model_filename=None,
params_filename=None):
params_filename=None,
export_for_deployment=True):
"""
Prune the given `main_program` to build a new program especially for inference,
and then save it and all related parameters to given `dirname` by the `executor`.
......@@ -577,6 +578,8 @@ def save_inference_model(dirname,
params_filename(str|None): The name of file to save all related parameters.
If it is setted None, parameters will be saved
in separate files .
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference lib. Default True
Returns:
None
......@@ -643,7 +646,8 @@ def save_inference_model(dirname,
copy_program.desc.flush()
pruned_program = copy_program.prune(targets=target_vars)
inference_program = pruned_program.inference_optimize()
inference_program = pruned_program.inference_optimize(
export_for_deployment=export_for_deployment)
fetch_var_names = [v.name for v in target_vars]
prepend_feed_ops(inference_program, feeded_var_names)
......
......@@ -20,7 +20,9 @@ from .layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper
from . import tensor
from . import nn
from . import ops
import math
import numpy
from functools import reduce
__all__ = [
......@@ -264,10 +266,11 @@ def detection_output(loc,
prior_box_var=prior_box_var,
target_box=loc,
code_type='decode_center_size')
old_shape = scores.shape
scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
compile_shape = scores.shape
run_shape = ops.shape(scores)
scores = nn.flatten(x=scores, axis=2)
scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=old_shape)
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
......@@ -677,9 +680,10 @@ def ssd_loss(location,
raise ValueError("Only support mining_type == max_negative now.")
num, num_prior, num_class = confidence.shape
conf_shape = ops.shape(confidence)
def __reshape_to_2d(var):
return nn.reshape(x=var, shape=[-1, var.shape[-1]])
return nn.flatten(x=var, axis=2)
# 1. Find matched boundding box by prior box.
# 1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
......@@ -690,7 +694,8 @@ def ssd_loss(location,
# 2. Compute confidence for mining hard examples
# 2.1. Get the target label based on matched indices
gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
gt_label = nn.reshape(
x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
gt_label.stop_gradient = True
target_label, _ = target_assign(
gt_label, matched_indices, mismatch_value=background_label)
......@@ -701,9 +706,12 @@ def ssd_loss(location,
target_label = __reshape_to_2d(target_label)
target_label.stop_gradient = True
conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
# 3. Mining hard examples
conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
conf_loss = nn.reshape(
x=conf_loss,
shape=(num, num_prior),
actual_shape=ops.slice(
conf_shape, axes=[0], starts=[0], ends=[2]))
conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32')
dtype = matched_indices.dtype
......@@ -772,7 +780,11 @@ def ssd_loss(location,
# 5.3 Compute overall weighted loss.
loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
# reshape to [N, Np], N is the batch size and Np is the prior box number.
loss = nn.reshape(x=loss, shape=[-1, num_prior])
loss = nn.reshape(
x=loss,
shape=(num, num_prior),
actual_shape=ops.slice(
conf_shape, axes=[0], starts=[0], ends=[2]))
loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
if normalize:
normalizer = nn.reduce_sum(target_loc_weight)
......@@ -1005,13 +1017,7 @@ def multi_box_head(inputs,
"""
def _reshape_with_axis_(input, axis=1):
if not (axis > 0 and axis < len(input.shape)):
raise ValueError("The axis should be smaller than "
"the arity of input and bigger than 0.")
new_shape = [
-1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
]
out = nn.reshape(x=input, shape=new_shape)
out = nn.flatten(x=input, axis=axis)
return out
def _is_list_or_tuple_(data):
......@@ -1101,11 +1107,13 @@ def multi_box_head(inputs,
stride=stride)
mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
new_shape = [
compile_shape = [
mbox_loc.shape[0],
mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
]
mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32"))
mbox_loc_flatten = nn.reshape(
mbox_loc, shape=compile_shape, actual_shape=run_shape)
mbox_locs.append(mbox_loc_flatten)
# get conf
......@@ -1117,11 +1125,15 @@ def multi_box_head(inputs,
padding=pad,
stride=stride)
conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
new_shape = [
new_shape = [0, -1, num_classes]
compile_shape = [
conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
conf_loc.shape[3] / num_classes, num_classes
]
conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
run_shape = tensor.assign(
numpy.array([0, -1, num_classes]).astype("int32"))
conf_loc_flatten = nn.reshape(
conf_loc, shape=compile_shape, actual_shape=run_shape)
mbox_confs.append(conf_loc_flatten)
if len(box_results) == 1:
......
......@@ -112,6 +112,8 @@ __all__ = [
'log',
'crop',
'rank_loss',
'prelu',
'flatten',
]
......@@ -5361,3 +5363,123 @@ def rank_loss(label, left, right, name=None):
"Right": right},
outputs={'Out': out})
return out
def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
helper = LayerHelper('prelu', **locals())
if mode not in ['all', 'channel', 'element']:
raise ValueError('mode should be one of all, channel, element.')
alpha_shape = [1]
if mode == 'channel':
alpha_shape = [1, x.shape[1], 1, 1]
elif mode == 'element':
alpha_shape = x.shape
dtype = helper.input_dtype(input_param_name='x')
alpha = helper.create_parameter(
attr=param_attr,
shape=alpha_shape,
dtype='float32',
is_bias=False,
default_initializer=Constant(1.0))
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="prelu",
inputs={"X": x,
'Alpha': alpha},
attrs={"mode": mode},
outputs={"Out": out})
return out
def flatten(x, axis=1, name=None):
"""
**Flatten layer**
Flattens the input tensor into a 2D matrix.
Examples:
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
axis = 2
We get:
Out.shape = (3 * 100, 4 * 100)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
axis = 0
We get:
Out.shape = (1, 3 * 100 * 100 * 4)
Args:
x (Variable): A tensor of rank >= axis.
axis (int): Indicate up to which input dimensions (exclusive) should
be flattened to the outer dimension of the output.
The value for axis must be in the range [0, R], where R
is the rank of the input tensor. When axis = 0, the shape
of the output tensor is (1, (d_0 X d_1 ... d_n), where the
shape of the input tensor is (d_0, d_1, ... d_n).
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: A 2D tensor with the contents of the input tensor, with input
dimensions up to axis flattened to the outer dimension of
the output and remaining input dimensions flattened into the
inner dimension of the output.
Raises:
ValueError: If x is not a variable.
ValueError: If axis is not in range [0, rank(x)].
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[4, 4, 3], dtype="float32")
out = fluid.layers.flatten(x=x, axis=2)
"""
helper = LayerHelper('flatten', **locals())
if not (isinstance(x, Variable)):
raise ValueError("The input x should be a Variable")
if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype)
helper.append_op(
type='flatten',
inputs={"X": x},
outputs={'Out': out},
attrs={"axis": axis})
return out
......@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
......
......@@ -105,5 +105,107 @@ class TestCrossEntropyOp3(OpTest):
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp4(OpTest):
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [10, 2, 4]
ins_num = np.prod(np.array(shape))
class_num = 10
X_2d = randomize_probability(ins_num, class_num, dtype='float64')
label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64")
cross_entropy_2d = np.asmatrix(
[[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])],
dtype="float64")
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [1])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": False}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
class TestCrossEntropyOp5(OpTest):
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [4, 3]
ins_num = np.prod(np.array(shape))
class_num = 37
X_2d = randomize_probability(ins_num, class_num)
label_2d = np.random.uniform(0.1, 1.0,
[ins_num, class_num]).astype("float32")
label_2d /= label_2d.sum(axis=1, keepdims=True)
cross_entropy_2d = (-label_2d * np.log(X_2d)).sum(
axis=1, keepdims=True).astype("float32")
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [class_num])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp6(OpTest):
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
shape = [4, 3, 2]
ins_num = np.prod(np.array(shape))
class_num = 17
X_2d = randomize_probability(ins_num, class_num)
label_index_2d = np.random.randint(
0, class_num, (ins_num), dtype="int32")
label_2d = np.zeros(X_2d.shape)
label_2d[np.arange(ins_num), label_index_2d] = 1
cross_entropy_2d = np.asmatrix(
[[-np.log(X_2d[i][label_index_2d[i]])]
for i in range(X_2d.shape[0])],
dtype="float32")
X = X_2d.reshape(shape + [class_num])
label = label_2d.reshape(shape + [class_num])
cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
self.inputs = {"X": X, "Label": label.astype(np.float32)}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": True}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import collections
SEED = 1
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(batch_size):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def operator_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, fluid.framework.Program) or \
isinstance(v, fluid.framework.Block):
continue
elif isinstance(v, core.OpDesc):
if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In operator_equal not equal:{0}\n".format(k))
return True
def block_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc) or isinstance(
v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
continue
elif k == "ops":
for i in range(0, len(a.ops)):
if not operator_equal(a.ops[i], b.ops[i]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
assert (len(a.ops) == len(b.ops))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In block_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
return True
def program_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc):
continue
elif k == 'blocks':
for i in range(0, len(a.blocks)):
if not block_equal(a.blocks[i], b.blocks[i]):
raise ValueError("In operator_equal not equal:{0}\n".format(
k))
return False
assert (len(a.blocks) == len(b.blocks))
elif (v != b.__dict__[k]):
raise ValueError("In program_equal not equal:{0}\n".format(k))
return True
class TestDistMnist(unittest.TestCase):
def test_desc_clone(self):
get_model(batch_size=20)
pserver_endpoints = "127.0.0.1:9123"
trainers = 1
current_endpoint = "127.0.0.1:9123"
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
main = pserver_prog.clone()
startup = startup_prog.clone()
self.assertTrue(program_equal(main, pserver_prog))
self.assertTrue(program_equal(startup, startup_prog))
if __name__ == "__main__":
unittest.main()
......@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase):
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python"
def start_pserver(self, model_file):
def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep,
......@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase):
(self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers)
ps0_pipe = subprocess.PIPE
ps1_pipe = subprocess.PIPE
if check_error_log:
print("ps0_cmd:", ps0_cmd)
print("ps1_cmd:", ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return ps0_proc, ps1_proc
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
if not check_error_log:
return ps0_proc, ps1_proc, None, None
else:
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid):
retry_times = 50
......@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase):
(e, retry_times))
retry_times -= 1
def check_with_place(self, model_file, delta=1e-3):
def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = {
"PATH": os.getenv("PATH"),
......@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase):
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1"
}
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
(self._python_interp, model_file,
"127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_local)
if not check_error_log:
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_local)
else:
print("trainer cmd:", local_cmd)
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=err_log,
env=env_local)
local_proc.wait()
out, err = local_proc.communicate()
local_ret = out
......@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('local_stderr: %s\n' % err)
# Run dist train to compare with local results
ps0, ps1 = self.start_pserver(model_file)
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
check_error_log)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
......@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase):
env1.update(required_envs)
FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE
if check_error_log:
print("tr0_cmd:", tr0_cmd)
print("tr1_cmd:", tr1_cmd)
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen(
tr0_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr0_proc.wait()
......@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase):
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
# close trainer file
if check_error_log:
tr0_pipe.close()
tr1_pipe.close()
ps0_pipe.close()
ps1_pipe.close()
# FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL)
......
......@@ -26,6 +26,12 @@ from paddle.fluid.layers.io import ListenAndServ
from paddle.fluid.layers.io import Recv
from paddle.fluid.layers.io import Send
from paddle.fluid import core
RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
)
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
class TestSendOp(unittest.TestCase):
def test_send(self):
......@@ -89,18 +95,29 @@ class TestSendOp(unittest.TestCase):
def init_client(self, place, port):
main = fluid.Program()
with fluid.program_guard(main):
main.global_block().append_op(
type="fetch_barrier",
inputs={},
outputs={},
attrs={
"endpoints": ["127.0.0.1:{0}".format(port)],
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
x = layers.data(
shape=[32, 32],
dtype='float32',
name='X',
append_batch_size=False)
fluid.initializer.Constant(value=2.3)(x, main.global_block())
get_var = main.global_block().create_var(
name="scale_0.tmp_0", # server side var
dtype="float32",
persistable=False,
shape=[32, 32])
fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
Send("127.0.0.1:%d" % port, [x])
o = Recv("127.0.0.1:%d" % port, [get_var])
......
......@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import unittest
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
import traceback
import collections
class TranspilerTest(unittest.TestCase):
......@@ -51,9 +54,18 @@ class TranspilerTest(unittest.TestCase):
self.origin_prog = main.clone()
return main
def get_trainer(self, config=None, sync_mode=True):
t = self._transpiler_instance(config, sync_mode)
return t.get_trainer_program()
def get_trainer(self, config=None):
src = fluid.default_startup_program().clone()
t = self._transpiler_instance(config)
trainer_main = t.get_trainer_program()
trainer_startup = fluid.default_startup_program()
assert (src.num_blocks == 1)
assert (trainer_startup.num_blocks == src.num_blocks)
return trainer_main, trainer_startup
def get_pserver(self, ep, config=None, sync_mode=True):
t = self._transpiler_instance(config, sync_mode)
......@@ -89,7 +101,21 @@ class TestBasicModel(TranspilerTest):
pserver, startup = self.get_pserver(self.pserver1_ep)
pserver2, startup2 = self.get_pserver(self.pserver2_ep)
trainer = self.get_trainer()
trainer, trainer_startup = self.get_trainer()
# splited var blocks should be in startup program
self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars)
self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars)
self.assertTrue("fc_w" in trainer_startup.global_block().vars)
self.assertTrue("fc_b" in trainer_startup.global_block().vars)
self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars)
self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars)
src = [op.type for op in trainer_startup.global_block().ops]
dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \
'fetch_barrier', 'concat']
self.assertEqual(src, dst)
self.assertEqual([op.type for op in trainer.global_block().ops], [
'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
......@@ -140,7 +166,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
pserver, startup = self.get_pserver(self.pserver1_ep, config)
pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
trainer = self.get_trainer(config)
trainer, _ = self.get_trainer(config)
self.assertEqual([op.type for op in trainer.global_block().ops], [
'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
......@@ -224,7 +250,7 @@ class TestLRDecay(TranspilerTest):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
self.assertEqual(len(pserver.blocks), 4)
lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
......@@ -254,12 +280,12 @@ class TestLRDecayConditional(TranspilerTest):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
serv_op = pserver.blocks[0].ops[0]
sub_blocks = []
optimize_blocks = []
for b in serv_op.attrs["optimize_blocks"]:
for b in serv_op.all_attrs()["optimize_blocks"]:
optimize_blocks.append(b.idx)
for b in pserver.blocks:
if b.idx not in optimize_blocks:
......@@ -303,7 +329,7 @@ class TestL2Decay(TranspilerTest):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
self.assertEqual(len(pserver.blocks), 3)
self.assertEqual([op.type for op in pserver.blocks[1].ops],
......@@ -338,7 +364,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
self.assertEqual(len(pserver.blocks), 9)
self.assertEqual([op.type for op in pserver.blocks[1].ops], [
......@@ -362,12 +388,13 @@ class TestL2DecayWithPiecewise(TranspilerTest):
class TestDistLookupTableBase(TranspilerTest):
def network_with_table(self, is_sparse, is_distributed):
self.table_size = 1000
self.emb_size = 64
def emb_pool(ids):
table_size = 1000
emb_size = 64
emb = fluid.layers.embedding(
input=ids,
size=[table_size, emb_size],
size=[self.table_size, self.emb_size],
dtype='float32',
param_attr='shared_w', # share parameter
is_sparse=is_sparse,
......@@ -412,7 +439,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "adam", "scale", "scale"])
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
......@@ -450,7 +477,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer = self.get_trainer()
trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
......@@ -483,7 +510,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"])
trainer = self.get_trainer(config)
trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
......@@ -522,7 +549,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer = self.get_trainer(config)
trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
......@@ -536,6 +563,22 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestDistLookupTableSliceSize(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=True)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config)
self.assertTrue(self.transpiler.has_distributed_lookup_table)
lookup_table_var = pserver1.global_block().vars[
self.transpiler.table_name]
row_size = lookup_table_var.shape[0]
calc_row_size = int(math.ceil(self.table_size / self.pservers))
self.assertEqual(row_size, calc_row_size)
class TestRMSPropOptimizer(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
......
......@@ -22,6 +22,7 @@ def fully_connected_naive(input, weights, bias_data=None):
w_h, w_c = weights.shape
x_data = np.reshape(input, [in_n, in_c * in_h * in_w])
# this transpose should be implemented at C code
w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w)))
result = None
......@@ -43,15 +44,11 @@ class TestFCMKLDNNOp(OpTest):
def setUp(self):
self.op_type = "fc"
self.use_mkldnn = True
self.with_bias = True
self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
self.attrs = {
'use_mkldnn': self.use_mkldnn,
'with_bias': self.with_bias
}
self.attrs = {'use_mkldnn': self.use_mkldnn, }
self.outputs = {
'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
......@@ -85,13 +82,11 @@ class TestFCMKLDNNOp3(TestFCMKLDNNOp):
class TestFCMKLDNNOp4(TestFCMKLDNNOp):
def init_op_type(self):
self.with_bias = False
self.matrix = MatrixGenerate(2, 32, 48, 2, 2)
class TestFCMKLDNNOp4(TestFCMKLDNNOp):
def init_op_type(self):
self.with_bias = False
self.matrix = MatrixGenerate(2, 32, 1000, 6, 6)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def fc_refer(matrix, with_bias):
in_n, in_c, in_h, in_w = matrix.input.shape
w_i, w_o = matrix.weights.shape
x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w])
w_data = np.reshape(matrix.weights, [w_i, w_o])
b_data = np.reshape(matrix.bias, [1, w_o])
result = None
if with_bias:
result = np.dot(x_data, w_data) + b_data
else:
result = np.dot(x_data, w_data)
return result
class MatrixGenerate:
def __init__(self, mb, ic, oc, h, w):
self.input = np.random.random((mb, ic, h, w)).astype("float32")
self.weights = np.random.random((ic * h * w, oc)).astype("float32")
self.bias = np.random.random((1, oc)).astype("float32")
class TestFCOp(OpTest):
def setUp(self):
self.op_type = "fc"
self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
self.with_bias = True
if self.with_bias:
self.inputs = {
'Input': self.matrix.input,
'W': self.matrix.weights,
'Bias': self.matrix.bias
}
else:
self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
self.attrs = {'use_mkldnn': False}
self.outputs = {'Out': fc_refer(self.matrix, self.with_bias)}
def test_check_output(self):
self.check_output()
class TestFCOpBiasBoth(TestFCOp):
def init_shapes(self, mb, ic, oc, h, w):
for with_bias in {True, False}:
self.with_bias = with_bias
self.matrix = MatrixGenerate(mb, ic, oc, h, w)
class TestFCOp1(TestFCOpBiasBoth):
def init_op_type(self):
self.init_shapes(2, 8, 10, 1, 1)
class TestFCOp2(TestFCOpBiasBoth):
def init_op_type(self):
self.init_shapes(4, 5, 6, 2, 2)
class TestFCOp4(TestFCOpBiasBoth):
def init_op_type(self):
self.init_shapes(1, 32, 64, 3, 3)
if __name__ == "__main__":
unittest.main()
......@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
from paddle.fluid.framework import Program, program_guard, default_main_program
from paddle.fluid.param_attr import ParamAttr
import decorators
from paddle.fluid.initializer import Constant
class TestBook(unittest.TestCase):
......@@ -465,6 +466,17 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_flatten(self):
program = Program()
with program_guard(program):
x = layers.data(
name='x',
append_batch_size=False,
shape=[4, 4, 3],
dtype="float32")
out = layers.flatten(x, axis=1, name="flatten")
self.assertIsNotNone(out)
def test_shape(self):
program = Program()
with program_guard(program):
......@@ -474,6 +486,20 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out)
print(str(program))
def test_prelu(self):
program = Program()
with program_guard(program):
input = layers.data(
name="input", shape=[5, 200, 100, 100], dtype="float32")
mode = 'channel'
out = layers.prelu(
input,
mode,
param_attr=ParamAttr(initializer=Constant(1.0)),
name='prelu')
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__':
unittest.main()
......@@ -20,30 +20,58 @@ from op_test import OpTest
class PReluTest(OpTest):
def setUp(self):
self.op_type = "prelu"
x_np = np.random.normal(size=(10, 10)).astype("float32")
for pos, val in np.ndenumerate(x_np):
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
while abs(val) < 1e-3:
x_np[pos] = np.random.normal()
val = x_np[pos]
x_np_sign = np.sign(x_np)
x_np = x_np_sign * np.maximum(x_np, .005)
alpha_np = np.array([.1], dtype="float32")
self.inputs = {'X': x_np, 'Alpha': alpha_np}
self.initTestCase()
x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32")
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
x_np[np.abs(x_np) < 0.005] = 0.02
if self.attrs == {'mode': "all"}:
alpha_np = np.random.rand(1).astype("float32")
self.inputs = {'X': x_np, 'Alpha': alpha_np}
elif self.attrs == {'mode': "channel"}:
alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32")
self.inputs = {'X': x_np, 'Alpha': alpha_np}
else:
alpha_np = np.random.rand(*x_np.shape).astype("float32")
self.inputs = {'X': x_np, 'Alpha': alpha_np}
out_np = np.maximum(self.inputs['X'], 0.)
out_np = out_np + np.minimum(self.inputs['X'],
0.) * self.inputs['Alpha']
assert out_np is not self.inputs['X']
self.outputs = {'Out': out_np}
def initTestCase(self):
self.attrs = {'mode': "channel"}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
self.check_grad(['X', 'Alpha'], 'Out')
def test_check_grad_ignore_x(self):
self.check_grad(['Alpha'], 'Out', no_grad_set=set('X'))
def test_check_grad_ignore_alpha(self):
self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
class TestCase1(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "all"}
class TestCase2(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "channel"}
class TestCase3(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "element"}
if __name__ == "__main__":
......
......@@ -17,6 +17,7 @@ import unittest
from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name
import paddle.fluid.layers as layers
import paddle.fluid as fluid
main_program = default_main_program()
......@@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase):
new_program = main_program.clone()
self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
def test_program_inference_optimize(self):
def net():
reader = fluid.layers.py_reader(
capacity=10,
shapes=[[-1, 10], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
use_double_buffer=True)
in_data, label = fluid.layers.read_file(reader)
predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=predict_label, label=label))
optimizer = fluid.optimizer.Adam()
optimizer.minimize(loss)
startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
net()
no_read_program = main_program.inference_optimize()
keep_read_program = main_program.inference_optimize(
export_for_deployment=False)
no_read_ops = no_read_program.global_block().ops
keep_read_ops = keep_read_program.global_block().ops
self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2)
self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader')
self.assertEqual(keep_read_ops[1].type, 'read')
for i in range(len(no_read_ops)):
self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type)
if __name__ == '__main__':
unittest.main()
......@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr("block_attr"))
self.assertEqual(0, op.block_attr_id("block_attr"))
mul_op = block.append_op()
mul_op.set_type("mul")
......
......@@ -195,6 +195,9 @@ class DistributeTranspiler(object):
if program is None:
program = default_main_program()
self.origin_program = program
self.origin_startup_program = default_startup_program().clone()
self.startup_program = default_startup_program()
self.trainer_num = trainers
self.sync_mode = sync_mode
self.trainer_id = trainer_id
......@@ -205,10 +208,10 @@ class DistributeTranspiler(object):
ps_dispatcher = self.config.split_method(self.pserver_endpoints)
self.has_distributed_lookup_table = self._has_distributed_lookup_table()
# split and create vars, then put splited vars in dicts for later use.
# step 1: split and create vars, then put splited vars in dicts for later use.
self._init_splited_vars()
# step 3.1: insert send op to send gradient vars to parameter servers
# step 2: insert send op to send gradient vars to parameter servers
ps_dispatcher.reset()
send_vars = []
......@@ -265,7 +268,7 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
# step 3.2: insert recv op to receive parameters from parameter server
# step 3: insert recv op to receive parameters from parameter server
recv_vars = []
for _, var in enumerate(send_vars):
recv_vars.append(self.grad_param_mapping[var])
......@@ -312,6 +315,8 @@ class DistributeTranspiler(object):
outputs={"Out": [orig_param]},
attrs={"axis": 0})
self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
if self.has_distributed_lookup_table:
self._replace_lookup_table_op_with_prefetch(program,
pserver_endpoints)
......@@ -328,8 +333,78 @@ class DistributeTranspiler(object):
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
delete_ops(self.origin_program.global_block(), self.optimize_ops)
self.origin_program.__str__()
return self.origin_program
def _get_trainer_startup_program(self,
recv_vars,
eplist,
startup_program=None):
"""
Get transpiled trainer side startup program.
Args:
startup_program(Program): Startup program.
Returns:
Program: trainer side startup program.
"""
if startup_program is None:
startup_program = self.startup_program
# FIXME(gongwb): delete not need ops.
# note that: some parameter is not trainable and those ops can't be deleted.
for varname, splited_var in self.param_var_mapping.iteritems():
# Get the eplist of recv vars
eps = []
for var in splited_var:
index = [v.name for v in recv_vars].index(var.name)
eps.append(eplist[index])
for var in splited_var:
if startup_program.global_block().has_var(var.name):
continue
startup_program.global_block().create_var(
name=var.name,
persistable=False,
type=var.type,
dtype=var.dtype,
shape=var.shape,
lod_level=var.lod_level)
op = startup_program.global_block().append_op(
type="recv",
inputs={},
outputs={"Out": splited_var},
attrs={
"epmap": eps,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
startup_program.global_block().append_op(
type="fetch_barrier",
inputs={},
outputs={},
attrs={
"endpoints": self.pserver_endpoints,
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
for varname, splited_var in self.param_var_mapping.iteritems():
#add concat ops to merge splited parameters received from parameter servers.
if len(splited_var) <= 1:
continue
orig_param = startup_program.global_block().vars[varname]
startup_program.global_block().append_op(
type="concat",
inputs={"X": splited_var},
outputs={"Out": [orig_param]},
attrs={"axis": 0})
return startup_program
def get_pserver_program(self, endpoint):
"""
Get parameter server side program.
......@@ -530,7 +605,10 @@ class DistributeTranspiler(object):
pserver_program._sync_with_cpp()
return pserver_program
def get_startup_program(self, endpoint, pserver_program):
def get_startup_program(self,
endpoint,
pserver_program,
startup_program=None):
"""
Get startup program for current parameter server.
Modify operator input variables if there are variables that
......@@ -540,12 +618,17 @@ class DistributeTranspiler(object):
endpoint (str): current pserver endpoint.
pserver_program (Program): call get_pserver_program first and
pass the result here.
startup_program (Program): if pass None, will use
default_startup_program
Returns:
Program: parameter server side startup program.
"""
s_prog = Program()
orig_s_prog = default_startup_program()
if not startup_program:
orig_s_prog = default_startup_program()
else:
orig_s_prog = startup_program
s_prog.random_seed = orig_s_prog.random_seed
params = self.param_grad_ep_mapping[endpoint]["params"]
......@@ -568,14 +651,16 @@ class DistributeTranspiler(object):
new_outputs = dict()
# do not append startup op if var is not on this pserver
op_on_pserver = False
for key in op.output_names:
newname, _ = _get_splited_name_and_shape(op.output(key)[0])
if newname:
op_on_pserver = True
new_outputs[key] = created_var_map[newname]
elif op.output(key)[0] in pserver_vars:
op_on_pserver = True
new_outputs[key] = pserver_vars[op.output(key)[0]]
# TODO(gongwb): remove this line.
if op.type not in ["recv", "fetch_barrier", "concat"]:
for key in op.output_names:
newname, _ = _get_splited_name_and_shape(op.output(key)[0])
if newname:
op_on_pserver = True
new_outputs[key] = created_var_map[newname]
elif op.output(key)[0] in pserver_vars:
op_on_pserver = True
new_outputs[key] = pserver_vars[op.output(key)[0]]
if op_on_pserver:
# most startup program ops have no inputs
......@@ -584,12 +669,12 @@ class DistributeTranspiler(object):
if op.type in [
"gaussian_random", "fill_constant", "uniform_random"
]:
op.attrs["shape"] = new_outputs["Out"].shape
op.set_attr("shape", list(new_outputs["Out"].shape))
s_prog.global_block().append_op(
type=op.type,
inputs=new_inputs,
outputs=new_outputs,
attrs=op.attrs)
attrs=op.all_attrs())
return s_prog
# ====================== private transpiler functions =====================
......@@ -603,7 +688,7 @@ class DistributeTranspiler(object):
self.table_name = None
for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True:
if op.attr('is_distributed') is True:
if self.table_name is None:
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]:
......@@ -877,9 +962,15 @@ class DistributeTranspiler(object):
# create table param and grad var in pserver program
origin_param_var = self.origin_program.global_block().vars[
self.table_name]
zero_dim = int(
math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
table_shape = list(origin_param_var.shape)
table_shape[0] = zero_dim
param_var = pserver_program.global_block().create_var(
name=origin_param_var.name,
shape=origin_param_var.shape,
shape=table_shape,
dtype=origin_param_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True)
......@@ -1008,7 +1099,6 @@ class DistributeTranspiler(object):
var_mapping[varname] = \
[program.global_block().var(orig_var.name)]
continue
var_mapping[varname] = []
orig_shape = orig_var.shape
orig_dim1_flatten = 1
......@@ -1263,7 +1353,7 @@ class DistributeTranspiler(object):
type=opt_op.type,
inputs=new_inputs,
outputs=outputs,
attrs=opt_op.attrs)
attrs=opt_op.all_attrs())
def _is_splited_grad_var(self, var, var_dict):
grad_block = None
......@@ -1296,7 +1386,7 @@ class DistributeTranspiler(object):
block._clone_variable(var)
return block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program
......@@ -1337,7 +1427,7 @@ class DistributeTranspiler(object):
type=opt_op.type,
inputs=inputs,
outputs=outputs,
attrs=opt_op.attrs)
attrs=opt_op.all_attrs())
def _is_op_connected(self, op1, op2):
# If one op's input is another op's output or
......@@ -1442,8 +1532,8 @@ class DistributeTranspiler(object):
# optimize
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attrs and \
int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
return False
......@@ -1466,8 +1556,8 @@ class DistributeTranspiler(object):
# and op_role_var to get the pair.
for input_name in op.input_arg_names:
if input_name.find("@GRAD") != -1 and \
op.attrs[RPC_OP_ROLE_ATTR_NAME]:
param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0]
op.attr(RPC_OP_ROLE_ATTR_NAME):
param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
params_grads.append([
origin_var_dict[param_name],
origin_var_dict[input_name]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册