“43cdafbb97e52e6d93cc5bbdc6e7486f27665fc8”上不存在“...dialogue_model_toolkit/deep_attention_matching/util.py”
未验证 提交 5a4a24cc 编写于 作者: T tangwei12 提交者: GitHub

Merge branch 'develop' into ckpt_m2

...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args): ...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return train_program, fluid.default_startup_program() return train_program, fluid.default_startup_program()
else: else:
raise ValueError( raise ValueError(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
) )
......
...@@ -108,10 +108,10 @@ def gen_job(): ...@@ -108,10 +108,10 @@ def gen_job():
tn_container["ports"][0]["containerPort"] = spreadport tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "TRAINERS", "value": str(args.trainers)}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster. # this settings before you run on your own cluster.
...@@ -167,16 +167,22 @@ def gen_job(): ...@@ -167,16 +167,22 @@ def gen_job():
tn_container["volumeMounts"] = volumeMounts tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs ps_container["env"] = envs
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs tn_container["env"] = envs
if args.disttype == "pserver": if args.disttype == "pserver":
tn_container["env"].append({ tn_container["env"].append({
"name": "TRAINING_ROLE", "name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER" "value": "TRAINER"
}) })
elif args.disttype == "nccl2" or args.disttype == "local": elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER # NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname) os.mkdir(args.jobname)
if args.disttype == "pserver": if args.disttype == "pserver":
......
...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE() ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF() ENDIF()
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result") SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ExternalProject_Add( ExternalProject_Add(
......
...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book ...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server: 第二步,启动Parameter Server:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
``` ```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer: 第三步,启动Trainer:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
``` ```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
......
...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list.append(f) ret_list.append(f)
return ret_list return ret_list
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
data_file = fluid.layers.io.open_files( data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
thread_num=1, thread_num=1,
......
...@@ -14,4 +14,3 @@ ...@@ -14,4 +14,3 @@
# #
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(tape)
# Dynamic Graph on Fluid
PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
challenging and we are still way from there. DyNet and PyTorch provide a good design
idea, the *tape*, that significantly eases the challenge. Also, DyNet provides
a C++ API that is as convenient as Python but with higher efficiency and could
conveniently integrate with industrial/production systems. This package, `tape`,
combines the good of
1. tape from PyTorch and DyNet
2. C++ API and core from DyNet
3. rich set of operators from PaddlePaddle
## Overview
We can implement Dynet-like Tape(See this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
by wrapping Paddle Fluid's `Operator` and `Variable`.
The user API is straight forward since
1. it is imperative. And it uses host language's control flow logic.
1. it avoids extra concepts such as `Scope` and `Executor`.
All of these benefits come at the cost of just adding one line `reset_global_tape`
at every iteration.
## Code Structure
In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
`type`, the pointers to the `Variable`s, and necessary attributes.
```c++
class Variable {
public:
VriableHandle Grad(); // returns its gradient variable
private:
framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
framework::Variable var_; // run time variable, holds data memory
};
using VariableHandle = shared_ptr<Variable>;
struct OpHandle {
string type_;
map<string, vector<VariableHandle>> inputs_;
map<string, vector<VariableHandle>> outputs_;
AttributeMap attrs_;
};
class Tape {
public:
void AddOp(OpHandle); // add op
void Forward(); // execute the tape_
void Backward(); // execute the backward of the tape_
private:
vector<OpHandle> tape_;
};
```
We uses `Function` to indicate layers. It takes care of parameter
initialization and `AddOp` to the Tape when it is called.
```c++
class Linear {
public:
Linear(int in_dim, int out_dim, const std::string &act)
: w_(new Variable("LinearWeight")),
b_(new Variable("LinearBias")),
act_(act) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{in_dim, out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
init_tape.Forward();
}
VariableHandle operator()(VariableHandle input) {
VariableHandle pre_bias(new Variable("linear"));
get_global_tape().AddOp("mul",
{{"X", {input}}, {"Y", {w_}}},
{{"Out", {pre_bias}}},
{{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
VariableHandle pre_act(new Variable("linear"));
get_global_tape().AddOp("elementwise_add",
{{"X", {pre_bias}}, {"Y", {b_}}},
{{"Out", {pre_act}}},
{{"axis", 1}});
VariableHandle post_act(new Variable("linear"));
get_global_tape().AddOp(act_,
{{"X", {pre_act}}},
{{"Out", {post_act}}},
{});
return post_act;
}
std::vector<VariableHandle> Params() { return {w_, b_}; }
private:
VariableHandle w_;
VariableHandle b_;
std::string act_;
};
```
## User API
```c++
// Model function
paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
paddle::tape::Mean mean;
// Optimizer
paddle::tape::SGD sgd(0.001);
// Data Feeder
paddle::tape::Fill data_feeder(...);
VariableHandle input(new paddle::tape::Variable("input"));
VariableHandle label(new paddle::tape::Variable("label"));
for (int i = 0; i < 2; ++i) {
reset_global_tape();
data_feeder(input, label);
auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
LOG(INFO) << loss.value(); // Run forward up to loss
// Run backward, store gradient of w at w->Grad()
get_global_tape.Backward(loss);
// Update w
sgd(linear1.Params());
sgd(linear2.Params());
}
```
<details>
<summary></summary>
digraph G {
subgraph cluster_0 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} | {output |<before_bias1> Out: before_bias1}}"];
elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} | {output |<before_act1> Out: before_act1}}"];
relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} | {output |<after_act1> Out: after_act1}}"];
linear1 -> elementwise_add1->relu1;
label = "forward tape";
}
linear1:before_mul1->before_mul1
linear1:weight1->weight1
linear1:before_bias1->before_bias1
elementwise_add1:bias1->bias1
elementwise_add1:before_bias1->before_bias1
elementwise_add1:before_act1->before_act1
relu1:before_act1->before_act1
relu1:after_act1->after_act1
subgraph cluster_1 {
node [shape=record,style=filled];
style=filled;
color=lightgrey;
linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} | {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} | {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
relu1_grad [label="{type: relu_grad | {input |<after_act1_grad> Out_grad: after_act1_grad} | {ouput | {<before_act1_grad>X_grad: before_act1_grad }}}"];
linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
label = "backward tape";
}
relu1_grad:after_act1_grad->after_act1_grad
relu1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_act1_grad->before_act1_grad
elementwise_add1_grad:before_bias1_grad->before_bias1_grad
elementwise_add1_grad:bias1_grad->bias1_grad
linear1_grad:before_mul1->before_mul1
linear1_grad:weight1->weight1
linear1_grad:before_bias1_grad->before_bias1_grad
linear1_grad:before_mul1_grad->before_mul1_grad
linear1_grad:weight1_grad->weight1_grad
subgraph cluster_2 {
node [shape=record];
label = "Linear1";
weight1
bias1
}
weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
bias1 -> bias1_grad [ label="Grad()", style="dashed"];
}
</details>
![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
## Code Reuse
We want to stay close to Paddle Fluid as much as possible.
### Reuse All Operators
As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
is about 10 lines of code, similar to expose an operator to Python.
### Reuse Compile Time InferShape and InferVarType
Note that all the symbolic information is stored at `tape::Varaible::desc_`, instead
of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
`InferVarType` every time we `AddOp` to the tape.
### Reuse Operator::Run
We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
`Scope` for every `Operator::Run()`.
## Possible Feature
### Release Memory on Backward
We can release memory aggressively. During backward, we can delete the OpHandle once
we have finished its backward. Since all the variable is managed by smart pointer, the
memory is automatically released when its `ref_count` goes to 0.
### Kernel Fusion
As a symbolic representation of the Tape is constructed first before the actual
execution, it would be possible to perform graph optimization. One use case is kernel
fusion.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/contrib/tape/tape.h"
#include "paddle/contrib/tape/variable.h"
#include "paddle/fluid/framework/type_defs.h"
namespace paddle {
namespace tape {
class Function {};
class Fill {
public:
Fill(const std::string &initializer, const framework::AttributeMap &attrs)
: initializer_(initializer), attrs_(attrs) {}
void operator()(VariableHandle var) {
get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
}
private:
const std::string initializer_;
const framework::AttributeMap attrs_;
};
class Mean {
public:
VariableHandle operator()(VariableHandle var) {
VariableHandle out(new Variable("mean"));
get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
return out;
}
};
class Linear {
public:
Linear(int in_dim, int out_dim, const std::string &act)
: w_(new Variable("LinearWeight")),
b_(new Variable("LinearBias")),
act_(act) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{in_dim, out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{out_dim};
attrs["value"] = 1.0f;
init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
init_tape.Forward();
}
VariableHandle operator()(VariableHandle input) {
VariableHandle pre_bias(new Variable("linear"));
get_global_tape().AddOp("mul",
{{"X", {input}}, {"Y", {w_}}},
{{"Out", {pre_bias}}},
{{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
VariableHandle pre_act(new Variable("linear"));
get_global_tape().AddOp("elementwise_add",
{{"X", {pre_bias}}, {"Y", {b_}}},
{{"Out", {pre_act}}},
{{"axis", 1}});
VariableHandle post_act(new Variable("linear"));
get_global_tape().AddOp(
act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
return post_act;
}
std::vector<VariableHandle> Params() { return {w_, b_}; }
private:
VariableHandle w_;
VariableHandle b_;
std::string act_;
};
class SGD {
public:
SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
Tape init_tape;
std::string initializer = "fill_constant";
framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = learning_rate;
init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
init_tape.Forward();
}
void operator()(VariableHandle input) {
PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
"optimization must happen after the backward");
Tape temp_tape;
temp_tape.AddOp("sgd",
{{"Param", {input}},
{"LearningRate", {learning_rate_}},
{"Grad", {input->Grad()}}},
{{"ParamOut", {input}}},
{});
temp_tape.Forward();
}
private:
VariableHandle learning_rate_;
};
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/tape.h"
#include <list>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dim.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace tape {
// borrowed from
// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
inline bool ends_with(std::string const &value, std::string const &ending) {
if (ending.size() > value.size()) return false;
return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
}
std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
os << var_desc.Name();
os << "[" << var_desc.GetType() << "]";
os << "[" << var_desc.GetDataType() << "]";
os << "{";
for (auto &i : var_desc.GetShape()) {
os << i << ",";
}
os << "}";
return os;
}
std::string to_string(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs) {
std::stringstream ss;
ss << type << " ";
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
ss << param_name.first << ":(" << var->Desc() << ") ";
}
}
for (auto &param_name : out_vars) {
for (auto &var : param_name.second) {
ss << param_name.first << ":(" << var->Desc() << ") ";
}
}
return ss.str();
}
framework::OpDesc CreateOpDesc(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs) {
framework::VariableNameMap inputs;
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
inputs[param_name.first].emplace_back(var->Name());
}
}
framework::VariableNameMap outputs;
for (auto &param_name : out_vars) {
for (auto &var : param_name.second) {
outputs[param_name.first].emplace_back(var->Name());
}
}
return framework::OpDesc(type, inputs, outputs, attrs);
}
void InferShapeAndVarType(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap *out_vars,
const framework::AttributeMap &attrs) {
framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
// Create a temporary block for compile-time
framework::ProgramDesc program_desc;
framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
PADDLE_ENFORCE(block_desc);
for (auto &param_name : in_vars) {
for (auto &var : param_name.second) {
*block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
}
}
for (auto &param_name : *out_vars) {
for (auto &var : param_name.second) {
*block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
}
}
LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
op_desc.InferShape(*block_desc);
op_desc.InferVarType(block_desc);
for (auto &param_name : *out_vars) {
for (auto &var : param_name.second) {
*var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
}
}
LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
}
void Tape::AddOp(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap out_vars,
const framework::AttributeMap &attrs) {
InferShapeAndVarType(type, in_vars, &out_vars, attrs);
tape_.emplace_back(type, in_vars, out_vars, attrs);
}
// Temporary Scope for Operator::Run()
class ScopeWrapper : public framework::Scope {
public:
ScopeWrapper(const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars) {
for (auto &v : in_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
for (auto &v : out_vars) {
for (auto &vv : v.second) {
if (!vars_.count(vv->Name())) {
vars_[vv->Name()].reset(vv->Var());
}
}
}
}
~ScopeWrapper() {
for (auto &pair : vars_) {
pair.second.release();
}
}
};
void Tape::Forward() {
LOG(INFO) << "Starting forward -------------------------";
PADDLE_ENFORCE(!has_been_backwarded_);
while (current_position_ < tape_.size()) {
OpHandle &op = tape_[current_position_];
// Create Output Tensor, this is only necessary for OpWithKernel
for (auto &param2var : op.outputs_) {
for (auto &var : param2var.second) {
var->InitializeVariable();
}
}
framework::OpDesc op_desc =
CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
ScopeWrapper scope(op.inputs_, op.outputs_);
framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
current_position_++;
}
LOG(INFO) << "Finishing forward -------------------------";
}
void Tape::Backward(VariableHandle target) {
PADDLE_ENFORCE(!has_been_backwarded_);
Forward();
// TODO(tonyyang-svail): check output of last op is target
backward_tape_.reset(new Tape());
framework::AttributeMap attrs;
// FIXME(tonyyang-svail): Need to infer_data_type
attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{1};
attrs["value"] = 1.0f;
backward_tape_->AddOp(
"fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
framework::OpDesc op_desc =
CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, {}, &grad_to_var, {});
for (auto &op_desc : grad_op_descs) {
std::unordered_map<std::string, VariableHandle> name2var;
for (auto &param2vars : it->inputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
for (auto &param2vars : it->outputs_) {
for (auto &a : param2vars.second) {
name2var[a->Name()] = a;
}
}
VariableHandleMap in_vars;
VariableHandleMap out_vars;
std::map<const framework::VariableNameMap *, VariableHandleMap *>
loop_over{{&op_desc->Inputs(), &in_vars},
{&op_desc->Outputs(), &out_vars}};
for (auto &each : loop_over) {
auto &vmp = *each.first;
auto &vhm = *each.second;
for (auto &p2a : vmp) {
for (auto &argu : p2a.second) {
if (name2var.count(argu)) {
vhm[p2a.first].push_back(name2var[argu]);
} else {
PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
argu.c_str());
std::string name = argu.substr(
0, argu.size() - std::strlen(framework::kGradVarSuffix));
PADDLE_ENFORCE(name2var.count(name), name.c_str());
vhm[p2a.first].push_back(name2var[name]->Grad());
}
}
}
}
backward_tape_->AddOp(
op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap());
}
// TODO(tonyyang-svail): how to fill empty grad?
// TODO(tonyyang-svail): Sum var grad is necessary
}
backward_tape_->Forward();
has_been_backwarded_ = true;
}
Tape &get_global_tape() {
static Tape T;
return T;
}
void reset_global_tape() { get_global_tape() = Tape(); }
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
struct OpHandle {
OpHandle(const std::string &type,
const VariableHandleMap &in_vars,
const VariableHandleMap &out_vars,
const framework::AttributeMap &attrs)
: type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
std::string type_;
VariableHandleMap inputs_;
VariableHandleMap outputs_;
framework::AttributeMap attrs_;
};
class Tape {
public:
void AddOp(const std::string &type,
const VariableHandleMap &in_vars,
VariableHandleMap out_vars,
const framework::AttributeMap &attrs);
void Forward();
void Backward(VariableHandle target);
bool HasBeenBackwarded() { return has_been_backwarded_; }
private:
bool has_been_backwarded_ = false;
size_t current_position_ = 0;
std::vector<OpHandle> tape_;
std::shared_ptr<Tape> backward_tape_;
};
Tape &get_global_tape();
void reset_global_tape();
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/contrib/tape/function.h"
using namespace paddle::tape;
TEST(Tape, TestMLP) {
LOG(INFO) << "TestMLP";
Linear linear1(3, 3, "relu");
Linear linear2(3, 3, "relu");
Mean mean;
SGD sgd(0.001);
std::string initializer = "fill_constant";
paddle::framework::AttributeMap attrs;
attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
attrs["shape"] = std::vector<int>{3, 3};
attrs["value"] = 1.0f;
Fill filler(initializer, attrs);
for (int i = 0; i < 2; ++i) {
reset_global_tape();
VariableHandle input(new Variable("input"));
filler(input);
auto loss = mean(linear2(linear1(input)));
get_global_tape().Backward(loss);
for (auto w : linear1.Params()) {
sgd(w);
}
for (auto w : linear2.Params()) {
sgd(w);
}
}
}
int main(int argc, char** argv) {
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/tape/variable.h"
namespace paddle {
namespace tape {
void Variable::InitializeVariable() {
LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
framework::proto::VarType::Type var_type = desc_.GetType();
if (var_type == framework::proto::VarType::LOD_TENSOR) {
var_.GetMutable<framework::LoDTensor>();
} else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
var_.GetMutable<framework::SelectedRows>();
} else {
PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
var_type);
}
}
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace tape {
class Variable;
using VariableHandle = std::shared_ptr<Variable>;
/*
* Combination of
* framework::VarDesc desc_;
* framework::Variable var_;
*/
class Variable {
public:
Variable(const std::string pre_fix)
: desc_(pre_fix + std::to_string(count())) {}
Variable(const std::string pre_fix, bool is_grad)
: desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
: std::to_string(count()))) {}
~Variable() { LOG(INFO) << "Deleting " << Name(); }
// Instantiate LoDTensor/SelectedRow
void InitializeVariable();
VariableHandle Grad() {
if (grad_.expired()) {
VariableHandle new_grad(new Variable(desc_.Name(), true));
grad_ = new_grad;
return new_grad;
} else {
return VariableHandle(grad_);
}
}
// Stochastic Gradient Descent with Momentum
// VariableHandle Momentum ();
// void init(const std::string& initializer,
// const framework::AttributeMap& attrs);
// void value() {};
const framework::VarDesc& Desc() const { return desc_; }
framework::VarDesc* MutableDesc() { return &desc_; }
// TODO(tonyyang-svail): No need to expose name
std::string Name() const { return desc_.Name(); }
framework::Variable* Var() { return &var_; }
private:
int count() {
static int counter = 0;
return counter++;
}
framework::VarDesc desc_;
framework::Variable var_;
std::weak_ptr<Variable> grad_;
};
}
}
...@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id) { const ProgramDesc& program, int block_id) {
auto* ctx = new ExecutorPrepareContext(program, block_id); std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size()); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id); auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
return std::unique_ptr<ExecutorPrepareContext>(ctx); return ctx;
} }
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
* big. * big.
*/ */
#pragma once
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
...@@ -43,7 +45,7 @@ struct Argument { ...@@ -43,7 +45,7 @@ struct Argument {
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ #define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \
if (!UNLIKELY(field__)) { \ if (UNLIKELY(!(field__))) { \
LOG(ERROR) << "field " << #field__ << " should be set."; \ LOG(ERROR) << "field " << #field__ << " should be set."; \
return false; \ return false; \
} }
......
...@@ -12,16 +12,20 @@ ...@@ -12,16 +12,20 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using paddle::framework::Tensor; using framework::DataLayout;
using paddle::platform::MKLDNNDeviceContext; using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::stream;
using platform::GetMKLDNNFormat;
using platform::MKLDNNDeviceContext;
using platform::to_void_cast;
namespace { namespace {
std::string gethash(const mkldnn::memory::dims &operand_dims, std::string gethash(const mkldnn::memory::dims &operand_dims,
...@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims, ...@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
}; };
return dim2str(operand_dims) + std::to_string(algorithm); return dim2str(operand_dims) + std::to_string(algorithm);
} }
} // namespace
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
"Wrong layout/format set for Input x tensor");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
template <typename T, typename ExecContext> template <typename Functor>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, class MKLDNNActivationGradKernel
const T alpha = 0, const T beta = 0) { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
diff_y->format() != memory::format::format_undef,
"Wrong layout/format set for Input OutGrad tensor");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
template <typename T>
void eltwise_forward(const framework::ExecutionContext &ctx,
mkldnn::algorithm algorithm, const T alpha = 0,
const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers const auto *x = ctx.Input<Tensor>("X");
const auto *src = ctx.template Input<Tensor>("X"); auto *y = ctx.Output<Tensor>("Out");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out"); const T *x_data = x->data<T>();
T *dst_data = dst->template mutable_data<T>(ctx.GetPlace()); T *y_data = y->mutable_data<T>(ctx.GetPlace());
// get memory dim PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
"Input dim must be with 2 or 4"); "Input dim must be with 2 or 4");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
std::vector<int> src_tz = framework::vectorize2int(x->dims());
auto src_format =
src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
const std::string key = gethash(src_tz, algorithm); const std::string key = gethash(src_tz, algorithm);
const std::string key_src_data = const std::string key_src_data =
key + ctx.op().Output("Out") + "@eltwise_fwd_src_data"; key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; const std::string key_src_layout =
const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem"; key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
const std::string key_fwd = key + "@eltwise_fwd"; const std::string key_with_layout = key + std::to_string(src_format);
const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
const std::string key_fwd = key_with_layout + "@eltwise_fwd";
const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
// save input data and layout to be referred in backward path
auto p_src_data = std::make_shared<const T *>(x_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
auto p_src_layout = std::make_shared<memory::format>(src_format);
dev_ctx.SetBlob(key_src_layout, p_src_layout);
auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>( auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
dev_ctx.GetBlob(key_fwd)); dev_ctx.GetBlob(key_fwd));
// save input data to be referred in backward path std::shared_ptr<memory> dst_memory;
auto p_src_data = std::make_shared<const T *>(src_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
if (p_fwd == nullptr) { if (p_fwd == nullptr) {
// create memory description // create mkldnn memory for input X
auto data_md = src_tz.size() == 2 auto src_md = platform::MKLDNNMemDesc(
? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, src_tz, platform::MKLDNNGetDataType<T>(), src_format);
mkldnn::memory::format::nc) auto src_memory = std::shared_ptr<memory>(
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
mkldnn::memory::format::nchw); // save src_memory to be referred in backward path
dev_ctx.SetBlob(key_src_mem, src_memory);
// create memory primitives
auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory( // create primitive descriptor for activation forward and save it
{data_md, mkldnn_engine}, platform::to_void_cast(src_data))); auto forward_desc = mkldnn::eltwise_forward::desc(
dev_ctx.SetBlob(key_src_mem, p_src_mem); mkldnn::prop_kind::forward_training, algorithm,
src_memory->get_primitive_desc().desc(), alpha, beta);
auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory( auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
{data_md, mkldnn_engine}, platform::to_void_cast(dst_data))); forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
// save prim desc into global device context to be referred in backward path
auto fwd_desc = mkldnn::eltwise_forward::desc( dev_ctx.SetBlob(key_fwd_pd, forward_pd);
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>( // create mkldnn memory for output y
fwd_desc, mkldnn_engine); dst_memory =
const std::string key_fwd_pd = key + "eltwise_fwd_pd"; std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
p_fwd = std::make_shared<mkldnn::eltwise_forward>( dev_ctx.SetBlob(key_dst_mem, dst_memory);
*p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
// create activation primitive
p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
*dst_memory);
dev_ctx.SetBlob(key_fwd, p_fwd); dev_ctx.SetBlob(key_fwd, p_fwd);
} else { } else {
// primitives already exist // primitives already exist
auto p_src_mem = auto src_memory =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem)); std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
PADDLE_ENFORCE(p_src_mem != nullptr, PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find eltwise p_src_mem in device context."); "Fail to find eltwise src_memory in device context.");
auto p_dst_mem = dst_memory =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem)); std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
PADDLE_ENFORCE(p_dst_mem != nullptr, PADDLE_ENFORCE(dst_memory != nullptr,
"Fail to find eltwise p_src_mem in device context."); "Fail to find eltwise dst_memory in device context.");
p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data)); src_memory->set_data_handle(platform::to_void_cast(x_data));
p_dst_mem->set_data_handle(dst_data); dst_memory->set_data_handle(y_data);
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())}; std::vector<primitive> pipeline;
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); pipeline.push_back(*p_fwd);
stream(stream::kind::eager).submit(pipeline).wait();
y->set_layout(DataLayout::kMKLDNN);
y->set_format(GetMKLDNNFormat(*dst_memory));
} }
template <typename T, typename ExecContext> template <typename T>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, void eltwise_grad(const framework::ExecutionContext &ctx,
const T alpha = 0, const T beta = 0) { mkldnn::algorithm algorithm, const T alpha = 0,
const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
const auto *out = ctx.template Input<Tensor>("Out"); auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx = const T *diff_y_data = diff_y->data<T>();
ctx.template Output<framework::Tensor>(framework::GradVarName("X")); T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
std::vector<int> src_tz = framework::vectorize2int(out->dims());
const std::string key = gethash(src_tz, algorithm); auto diff_y_format =
const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem"; diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
const std::string key_grad = key + "@eltwise_grad";
const std::string key = gethash(diff_dst_tz, algorithm);
const std::string key_src_data = const std::string key_src_data =
key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
const std::string key_src_layout =
key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
const auto p_src_layout =
std::static_pointer_cast<memory::format>(dev_ctx.GetBlob(key_src_layout));
const std::string key_src_mem =
key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
const std::string key_fwd_pd =
key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
const std::string key_with_layouts =
key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
const std::string key_diff_src_mem =
key_with_layouts + "@eltwise_diff_src_mem";
const std::string key_diff_dst_mem =
key_with_layouts + "@eltwise_diff_dst_mem";
const std::string key_grad = key_with_layouts + "@eltwise_grad";
const auto p_src_data = const auto p_src_data =
std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data)); std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; auto src_memory =
auto p_src_mem =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem)); std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
p_src_mem->set_data_handle(*p_src_data.get()); PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find src_memory in device context");
src_memory->set_data_handle(*p_src_data.get());
std::shared_ptr<memory> diff_src_memory;
auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>( auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
dev_ctx.GetBlob(key_grad)); dev_ctx.GetBlob(key_grad));
if (p_grad == nullptr) { if (p_grad == nullptr) {
// create memory description // create mkldnn memory for input diff_y
auto data_md = src_tz.size() == 2 auto diff_dst_md = platform::MKLDNNMemDesc(
? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
mkldnn::memory::format::nc) auto diff_dst_memory = std::shared_ptr<memory>(
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
mkldnn::memory::format::nchw); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
// create memory primitives // retrieve eltwise primitive desc from device context
std::shared_ptr<void> p_diff_src_mem = auto forward_pd =
std::make_shared<mkldnn::memory>(mkldnn::memory( std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
{data_md, mkldnn_engine}, platform::to_void_cast(diff_src))); dev_ctx.GetBlob(key_fwd_pd));
dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem); PADDLE_ENFORCE(forward_pd != nullptr,
std::shared_ptr<void> p_diff_dst_mem = "Fail to find eltwise_fwd_pd in device context");
std::make_shared<mkldnn::memory>(mkldnn::memory(
{data_md, mkldnn_engine}, platform::to_void_cast(diff_dst))); // ceate primitive descriptor for activation backward
dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem); auto backward_desc = mkldnn::eltwise_backward::desc(
algorithm, diff_dst_memory->get_primitive_desc().desc(),
auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, src_memory->get_primitive_desc().desc(), alpha, beta);
alpha, beta); auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *forward_pd);
const std::string key_fwd_pd = key + "eltwise_fwd_pd";
auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>( // create mkldnn memory for output diff_src
dev_ctx.GetBlob(key_fwd_pd).get()); diff_src_memory = std::make_shared<memory>(
backward_pd.diff_src_primitive_desc(), diff_x_data);
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc( dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
bwd_desc, mkldnn_engine, *p_fwd_pd);
// create activation backward primitive
p_grad = std::make_shared<mkldnn::eltwise_backward>( p_grad = std::make_shared<mkldnn::eltwise_backward>(
eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()), backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
*(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())), dev_ctx.SetBlob(key_grad, p_grad);
*(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
} else { } else {
// primitives already exist // primitives already exist
auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>( diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_diff_src_mem)); dev_ctx.GetBlob(key_diff_src_mem));
auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>( auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_diff_dst_mem)); dev_ctx.GetBlob(key_diff_dst_mem));
p_diff_src_mem->set_data_handle( diff_src_memory->set_data_handle(
platform::to_void_reinterpret_cast(diff_src)); platform::to_void_reinterpret_cast(diff_x_data));
p_diff_dst_mem->set_data_handle( diff_dst_memory->set_data_handle(
platform::to_void_reinterpret_cast(diff_dst)); platform::to_void_reinterpret_cast(diff_y_data));
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())}; std::vector<primitive> pipeline;
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); pipeline.push_back(*p_grad);
stream(stream::kind::eager).submit(pipeline).wait();
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
} }
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm> template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> { struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext> void operator()(const framework::ExecutionContext &ctx) const {
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm); eltwise_forward<T>(ctx, algorithm);
} }
}; };
template <typename T, mkldnn::algorithm algorithm> template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> { struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext> void operator()(const framework::ExecutionContext &ctx) const {
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm); eltwise_grad<T>(ctx, algorithm);
} }
}; };
......
...@@ -19,18 +19,20 @@ limitations under the License. */ ...@@ -19,18 +19,20 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ using paddle::framework::Tensor;
class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
public: \ class OP_NAME##OpMaker \
void Make() override { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
AddInput("X", "Input of " #OP_NAME " operator"); \ public: \
AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ void Make() override { \
AddAttr<bool>("use_mkldnn", \ AddInput("X", "Input of " #OP_NAME " operator"); \
"(default false) Only used in mkldnn kernel") \ AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \
.SetDefault(false); \ AddAttr<bool>("use_mkldnn", \
AddComment(OP_COMMENT); \ "(bool, default false) Only used in mkldnn kernel") \
} \ .SetDefault(false); \
AddComment(#OP_COMMENT); \
} \
} }
#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \
...@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper, const framework::OperatorWithKernel& oper,
const std::string& name) { const std::string& name) {
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
auto it = oper.Attrs().find("use_mkldnn"); auto it = oper.Attrs().find("use_mkldnn");
...@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel { ...@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this, "X"); return GetKernelType(ctx, *this, "X");
...@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ...@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
} }
protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this, "Out"); return GetKernelType(ctx, *this, "Out");
...@@ -140,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ ...@@ -140,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator. TanhShrink Activation Operator.
$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
...@@ -382,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -382,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
STanh Activation Operator. STanh Activation Operator.
$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ $$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
)DOC"); )DOC");
} }
......
...@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T> ...@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
class ConcatGradKernel : public framework::OpKernel<T> { class ConcatGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
// get output tensor that the name is not kEmptyVarName
std::vector<framework::Tensor*> outputs;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs.push_back(outs[j]);
} else {
outputs.push_back(nullptr);
}
}
// Sometimes direct copies will be faster, this maybe need deeply analysis. // Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && outs.size() < 10) { if (axis == 0 && outs.size() < 10) {
size_t input_offset = 0; size_t input_offset = 0;
auto in_stride = framework::stride_numel(in->dims()); const auto in_stride = framework::stride_numel(out_grad->dims());
for (auto& out : outs) { for (size_t i = 0; i < outs.size(); ++i) {
out->mutable_data<T>(ctx.GetPlace()); auto out_stride = framework::stride_numel(ins[i]->dims());
auto out_stride = framework::stride_numel(out->dims()); auto* out = outputs[i];
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(), if (out != nullptr) {
out_stride, in->data<T>() + input_offset, StridedNumelCopyWithAxis<T>(
in_stride, out_stride[axis]); ctx.device_context(), axis, out->data<T>(), out_stride,
out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
}
input_offset += out_stride[axis]; input_offset += out_stride[axis];
} }
} else { } else {
std::vector<framework::Tensor> outputs(outs.size());
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs[j] = *outs[j];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatGradFunctor<DeviceContext, T> paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
concat_grad_functor; concat_grad_functor;
concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs); concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
&outputs);
} }
} }
}; };
......
...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Detection mAP evaluate operator. Detection mAP evaluate operator.
The general steps are as follows. First, calculate the true positive and The general steps are as follows. First, calculate the true positive and
false positive according to the input of detection and labels, then false positive according to the input of detection and labels, then
calculate the mAP evaluate value. calculate the mAP evaluate value.
Supporting '11 point' and 'integral' mAP algorithm. Please get more information Supporting '11 point' and 'integral' mAP algorithm. Please get more information
from the following articles: from the following articles:
https://sanchom.wordpress.com/tag/average-precision/ https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325 https://arxiv.org/abs/1512.02325
)DOC"); )DOC");
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/mean_op.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
template <typename T>
class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
ops::GaussianMKLDNNKernel<float>);
...@@ -15,6 +15,10 @@ limitations under the License. */ ...@@ -15,6 +15,10 @@ limitations under the License. */
#include <random> #include <random>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")), static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context()); ctx.device_context(), layout, library);
} }
}; };
...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default 5(FP32)) " "(int, default 5(FP32)) "
"Output data type.") "Output data type.")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
GaussianRandom Operator. GaussianRandom Operator.
......
...@@ -70,35 +70,40 @@ template <typename T> ...@@ -70,35 +70,40 @@ template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input,
std::vector<framework::Tensor>* outputs) { const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs->size(); size_t num = outputs->size();
int input_rows = 1; int input_rows = 1;
auto dim_0 = outputs->at(0).dims(); auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i]; input_rows *= dim_0[i];
} }
int input_cols = 0; int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size()); std::vector<int64_t> output_cols(outputs->size());
for (int i = 0; i < num; ++i) { for (size_t i = 0; i < num; ++i) {
int t_cols = outputs->at(i).numel() / input_rows; int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols; input_cols += t_cols;
output_cols[i] = t_cols; output_cols[i] = t_cols;
} }
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace()); auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation // computation
for (int k = 0; k < input_rows; ++k) { for (size_t k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols; const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
T* dst_ptr = outputs->at(j).data<T>() + k * col_len; auto* out_tensor = outputs->at(j);
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, if (out_tensor != nullptr) {
sizeof(T) * col_len); T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len; col_idx += col_len;
} }
} }
......
...@@ -22,43 +22,24 @@ namespace paddle { ...@@ -22,43 +22,24 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols, const int output_rows, const int output_cols,
T* output) { T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = input_cols[0];
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = input_cols[curr_segment + 1];
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = input_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment]; T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
...@@ -89,23 +70,25 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -89,23 +70,25 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int* out_cols, const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = out_cols[segment]; int curr_offset = out_cols[0];
int curr_segment = segment;
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = out_cols[curr_segment + 1];
while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = out_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs_data[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; if (output_ptr != nullptr) {
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
output_ptr[tid_y * segment_width + local_col] = for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
input_data[tid_y * in_col + tid_x]; output_ptr[tid_y * segment_width + local_col] =
input_data[tid_y * in_col + tid_x];
}
} }
} }
...@@ -118,10 +101,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -118,10 +101,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
int split = tid_x / fixed_out_col; int split = tid_x / fixed_out_col;
int in_offset = tid_x - split * fixed_out_col; int in_offset = tid_x - split * fixed_out_col;
T* output_ptr = outputs_data[split]; T* output_ptr = outputs_data[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; if (output_ptr != nullptr) {
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
output_ptr[tid_y * fixed_out_col + in_offset] = for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
input_data[tid_y * in_col + tid_x]; output_ptr[tid_y * fixed_out_col + in_offset] =
input_data[tid_y * in_col + tid_x];
}
} }
} }
...@@ -203,17 +188,18 @@ template <typename T> ...@@ -203,17 +188,18 @@ template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input,
std::vector<framework::Tensor>* outputs) { const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int o_num = outputs->size(); int o_num = outputs->size();
int out_row = 1; int out_row = 1;
auto dim_0 = outputs->at(0).dims(); auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
out_row *= dim_0[i]; out_row *= dim_0[i];
} }
int out_col = outputs->at(0).numel() / out_row; int out0_col = ref_inputs[0]->numel() / out_row;
int in_col = 0, in_row = out_row; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
...@@ -223,13 +209,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -223,13 +209,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int t_col = outputs->at(i).numel() / out_row; int t_col = ref_inputs.at(i)->numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != out_col) sameShape = false; if (t_col != out0_col) sameShape = false;
} }
in_col += t_col; in_col += t_col;
outputs_cols[i + 1] = in_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs->at(i).data<T>(); if (outputs->at(i) != nullptr) {
outputs_ptr[i] = outputs->at(i)->data<T>();
} else {
outputs_ptr[i] = nullptr;
}
} }
T** dev_out_gpu_data = T** dev_out_gpu_data =
...@@ -255,7 +245,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -255,7 +245,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data); input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
......
...@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T> ...@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
class ConcatGradFunctor { class ConcatGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>* outputs); const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs);
}; };
} // namespace math } // namespace math
......
...@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{}); framework::AttributeMap{{"use_mkldnn", {false}}});
VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]); sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]); WaitOnPlace(places[0]);
......
...@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); {{"Out", {pg_names[param_id]}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
sum_op->Run(cur_scope, place); sum_op->Run(cur_scope, place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*Licensed under the Apache License, Version 2.0(the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::CPUDeviceContext;
using framework::DataLayout;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::stream;
using mkldnn::sum;
using mkldnn::reorder;
using platform::to_void_cast;
template <typename T>
class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto in_vars = ctx.MultiInputVar("X");
const int N = in_vars.size();
auto out_var = ctx.OutputVar("Out");
bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) {
LoDTensor* output = ctx.Output<LoDTensor>("Out");
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> dst_tz = framework::vectorize2int(output->dims());
auto src_tz = dst_tz;
memory::format output_format{memory::format::format_undef};
std::vector<float> scales;
std::vector<memory::primitive_desc> srcs_mpd;
std::vector<mkldnn::memory> srcs_mem;
PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
"Input[0] must be LoDTensors");
auto& input0 = in_vars[0]->Get<LoDTensor>();
PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
input0.format() != memory::format::format_undef,
"Wrong layout/format for inputs[0]");
memory::format input_format = input0.format();
if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
input_format == memory::format::nhwc)) {
input_format = memory::format::x;
}
if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
input_format == memory::format::nhwc)) {
input_format = memory::format::nc;
}
for (int i = in_place ? 1 : 0; i < N; i++) {
PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
"all inputs must be all LoDTensors");
auto& input = in_vars[i]->Get<LoDTensor>();
PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
input.format() != memory::format::format_undef,
"Wrong layout/format for inputs");
if (input.numel() == 0) {
continue;
}
const T* input_data = input.data<T>();
auto src_md =
memory::desc(src_tz, memory::data_type::f32, input_format);
auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
auto src_mem = memory(src_mpd, to_void_cast(input_data));
srcs_mpd.push_back(src_mpd);
srcs_mem.push_back(src_mem);
scales.push_back(1.0);
}
auto dst_md =
memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
std::shared_ptr<memory> dst_mem;
if (in_place) {
dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
} else {
dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
}
std::vector<mkldnn::primitive::at> inputs;
for (size_t i = 0; i < srcs_mem.size(); ++i) {
inputs.push_back(srcs_mem[i]);
}
auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);
primitive reorder_prim;
std::shared_ptr<memory> target_mem;
if (in_place) {
output_format = input_format;
target_mem.reset(new memory(
{{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
output_data));
reorder_prim = reorder(*dst_mem, *target_mem);
}
std::vector<primitive> pipeline;
pipeline.push_back(sum_prim);
if (in_place) pipeline.push_back(reorder_prim);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(output_format);
} else if (out_var->IsType<framework::SelectedRows>()) {
// TODO(@mozga-intel) Add MKLDNN SelectedRows support
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
auto& rows = in_sel0.rows();
in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
in0->mutable_value()->ShareDataWith(in_sel0.value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows& {
if (i == 0 && in0) {
return *in0.get();
} else {
return in_vars[i]->Get<SelectedRows>();
}
};
auto* out = ctx.Output<SelectedRows>("Out");
out->mutable_rows()->clear();
auto* out_value = out->mutable_value();
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
auto in_dim =
framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<T>(ctx.GetPlace());
math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
offset, out);
offset += sel_row.value().numel();
}
} else if (out_var->IsType<framework::LoDTensorArray>()) {
// TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].numel() != 0) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
framework::TensorCopy(in_array[i], in_array[i].place(),
ctx.device_context(), &out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(*ctx.template device_context<MKLDNNDeviceContext>()
.eigen_device()) = result + in;
}
}
}
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
out_var->Type().name());
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::SumMKLDNNOpKernel<float>);
...@@ -18,6 +18,10 @@ limitations under the License. */ ...@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
...@@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X"); auto x_vars = ctx.MultiInputVar("X");
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
if (x_vars[0]->IsType<framework::LoDTensor>()) { if (x_vars[0]->IsType<framework::LoDTensor>()) {
int dtype = -1; int dtype = -1;
for (auto& x_var : x_vars) { for (auto& x_var : x_vars) {
...@@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel {
"Sum operator should have at least one tensor"); "Sum operator should have at least one tensor");
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(dtype), static_cast<framework::proto::VarType::Type>(dtype), ctx.GetPlace(),
ctx.device_context()); layout, library);
} else if (x_vars[0]->IsType<framework::SelectedRows>()) { } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
for (auto& var : x_vars) { for (auto& var : x_vars) {
auto& value = var->Get<framework::SelectedRows>().value(); auto& value = var->Get<framework::SelectedRows>().value();
if (value.IsInitialized()) { if (value.IsInitialized()) {
return framework::OpKernelType(framework::ToDataType(value.type()), return framework::OpKernelType(framework::ToDataType(value.type()),
ctx.device_context()); ctx.device_context(), layout, library);
} }
} }
// if input sparse vars are not initialized, use an default kernel type. // if input sparse vars are not initialized, use an default kernel type.
return framework::OpKernelType(framework::proto::VarType::FP32, return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context()); ctx.device_context(), layout, library);
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) { } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
for (auto& x_var : x_vars) { for (auto& x_var : x_vars) {
auto& array = x_var->Get<framework::LoDTensorArray>(); auto& array = x_var->Get<framework::LoDTensorArray>();
for (auto& each : array) { for (auto& each : array) {
if (each.numel() != 0) { if (each.numel() != 0) {
return framework::OpKernelType(framework::ToDataType(each.type()), return framework::OpKernelType(framework::ToDataType(each.type()),
ctx.device_context()); ctx.device_context(), layout,
library);
} }
} }
} }
...@@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "(vector<Tensor>) The input tensors of sum operator.") AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Sum operator. Sum operator.
...@@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
framework::BlockDesc* block) const override { framework::BlockDesc* block) const override {
auto& inputs = op_desc.Input("X"); auto& inputs = op_desc.Input("X");
auto var_type = framework::proto::VarType::SELECTED_ROWS; auto var_type = framework::proto::VarType::SELECTED_ROWS;
for (auto& name : op_desc.Input("X")) { for (auto& name : op_desc.Input("X")) {
VLOG(10) << name << " " VLOG(10) << name << " "
<< block->FindRecursiveOrCreateVar(name).GetType(); << block->FindRecursiveOrCreateVar(name).GetType();
...@@ -206,6 +225,7 @@ namespace ops = paddle::operators; ...@@ -206,6 +225,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
ops::SumOpVarTypeInference); ops::SumOpVarTypeInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>, sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
ops::SumKernel<paddle::platform::CPUDeviceContext, double>, ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
......
...@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
->set_lod(inside_tensor.lod()); ->set_lod(inside_tensor.lod());
} }
} }
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); {{"Out", {pg_names[param_id]}}},
framework::AttributeMap{{"use_mkldnn", {false}}});
sum_op->Run(cur_scope, dev_place); sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
} }
......
...@@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1, ...@@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle," "Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
DEFINE_uint64( DEFINE_uint64(initial_cpu_memory_in_mb,
initial_cpu_memory_in_mb, 500, #ifdef PADDLE_WITH_MKLDNN
"Default initial 500MB of CPU memory for PaddlePaddle, in MD unit."); /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
* to obtain the best performance*/
5000,
#else
500,
#endif
"Initial CPU memory for PaddlePaddle, in MD unit.");
DEFINE_double( DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5, fraction_of_cuda_pinned_memory_to_use, 0.5,
...@@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() { ...@@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() {
size_t CpuMaxAllocSize() { size_t CpuMaxAllocSize() {
// For distributed systems, it requires configuring and limiting // For distributed systems, it requires configuring and limiting
// the fraction of memory to use. // the fraction of memory to use.
return std::min( return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
CpuTotalPhysicalMemory()),
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
} }
size_t CpuMinChunkSize() { size_t CpuMinChunkSize() {
...@@ -71,8 +74,11 @@ size_t CpuMinChunkSize() { ...@@ -71,8 +74,11 @@ size_t CpuMinChunkSize() {
} }
size_t CpuMaxChunkSize() { size_t CpuMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 3% of CPU memory. // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
return CpuMaxAllocSize() / 32; // or the initial_cpu_memory_in_mb.
return std::min(
static_cast<size_t>(CpuMaxAllocSize() / 32),
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
} }
size_t CUDAPinnedMaxAllocSize() { size_t CUDAPinnedMaxAllocSize() {
......
...@@ -99,5 +99,11 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { ...@@ -99,5 +99,11 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
memory.get_primitive_desc().desc().data.format); memory.get_primitive_desc().desc().data.format);
} }
inline mkldnn::memory::format GetMKLDNNFormat(
const mkldnn::sum::primitive_desc& memory) {
return static_cast<mkldnn::memory::format>(
memory.dst_primitive_desc().desc().data.format);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -30,8 +30,9 @@ int main(int argc, char** argv) { ...@@ -30,8 +30,9 @@ int main(int argc, char** argv) {
new_argv.push_back( new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else #else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); new_argv.push_back(strdup(
new_argv.push_back(strdup("--undefok=use_mkldnn")); "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
#endif #endif
int new_argc = static_cast<int>(new_argv.size()); int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data(); char** new_argv_address = new_argv.data();
......
...@@ -117,7 +117,7 @@ def __bootstrap__(): ...@@ -117,7 +117,7 @@ def __bootstrap__():
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn' 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
...@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var): ...@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object): class WeightedAverage(object):
"""
Calculate weighted average.
The average calculating is accomplished via Python totally.
They do not change Paddle's Program, nor do anything to
modify NN model's configuration. They are completely
wrappers of Python functions.
Examples:
.. code-block:: python
avg = fluid.average.WeightedAverage()
avg.add(value=2.0, weight=1)
avg.add(value=4.0, weight=2)
avg.eval()
# The result is 3.333333333.
# For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
"""
def __init__(self): def __init__(self):
warnings.warn( warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." % "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
......
...@@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs):
for idx, op_desc in enumerate(op_descs): for idx, op_desc in enumerate(op_descs):
for var_name in op_desc.input_arg_names(): for var_name in op_desc.input_arg_names():
if len(renamed_vars[var_name]) > 1: if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append( pending_sum_ops.append((_create_op_desc_(
(_create_op_desc_("sum", {"X": renamed_vars[var_name]}, "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
{"Out": [var_name]}, {}), idx)) {"use_mkldnn": False}), idx))
renamed_vars[var_name] = [var_name] renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names(): for var_name in op_desc.output_arg_names():
if var_name == core.empty_var_name( if var_name == core.empty_var_name(
...@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
else: else:
if len(renamed_vars[var_name]) == 1: if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \ new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name]) str(var_rename_count[var_name])
var_rename_count[var_name] += 1 var_rename_count[var_name] += 1
# rename original var_name # rename original var_name
renamed_vars[var_name][0] = new_name renamed_vars[var_name][0] = new_name
...@@ -155,14 +155,15 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -155,14 +155,15 @@ def _addup_repetitive_outputs_(op_descs):
_rename_arg_(pending_sum_ops, var_name, new_name) _rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \ new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name]) str(var_rename_count[var_name])
var_rename_count[var_name] += 1 var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name) op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name) renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems(): for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1: if len(inputs) > 1:
pending_sum_ops.append((_create_op_desc_( pending_sum_ops.append(
"sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs))) (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
{"use_mkldnn": False}), len(op_descs)))
# sum_op descs are sorted according to their insert position # sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops): for p in reversed(pending_sum_ops):
op_descs.insert(p[1], p[0]) op_descs.insert(p[1], p[0])
...@@ -434,18 +435,65 @@ def _get_stop_gradients_(program): ...@@ -434,18 +435,65 @@ def _get_stop_gradients_(program):
def append_backward(loss, parameter_list=None, no_grad_set=None, def append_backward(loss, parameter_list=None, no_grad_set=None,
callbacks=None): callbacks=None):
""" """
Append backward part to main_program Append backward part to main_program.
Args: A complete neural network training is made up of forward and backward
loss(Variable): The variable generated by cost function. propagation. However, when we configure a network, we only need to
parameter_list(list[string]): Parameters that need to be updated by specify its forwrd part. The backward part is generated automatically
optimizer. If None, it means all parameters need to be updated. according to the forward part by this function.
no_grad_set(set): Variables that have no gradients in Block 0.
All variables with `step_gradient=True` from all blocks will be
automatically added.
Return: In most cases, users do not need to invoke this function manually. It
(list[(Variable,Variable)]): list of (parameter, gradient) pair. will be automatically invoked by the optimizer's `minimize` function.
Args:
loss(Variable): The loss variable of the network.
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`step_gradient=True` from all blocks will
be automatically added into this set.
Default: None
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must has two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
corresponding original variables.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Returns:
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
value is gradient variable.
Raises:
AssertionError: If `loss` is not an instance of Variable.
Examples:
.. code-block:: python
# network configuration code
# ...
avg_loss = fluid.layers.mean(loss)
param_grad_list = fluid.backward.append_backward(loss=avg_loss)
""" """
assert isinstance(loss, framework.Variable) assert isinstance(loss, framework.Variable)
......
...@@ -24,8 +24,6 @@ __all__ = [ ...@@ -24,8 +24,6 @@ __all__ = [
'GradientClipByValue', 'GradientClipByValue',
'GradientClipByNorm', 'GradientClipByNorm',
'GradientClipByGlobalNorm', 'GradientClipByGlobalNorm',
'append_gradient_clip_ops',
'error_clip_callback',
] ]
...@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object): ...@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
class ErrorClipByValue(BaseErrorClipAttr): class ErrorClipByValue(BaseErrorClipAttr):
"""
Clips tensor values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
"""
def __init__(self, max, min=None): def __init__(self, max, min=None):
max = float(max) max = float(max)
if min is None: if min is None:
...@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr): ...@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
class GradientClipByValue(BaseGradientClipAttr): class GradientClipByValue(BaseGradientClipAttr):
"""
Clips gradient values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByValue(-1.0, 1.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, max, min=None): def __init__(self, max, min=None):
max = float(max) max = float(max)
if min is None: if min is None:
...@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr): ...@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
class GradientClipByNorm(BaseGradientClipAttr): class GradientClipByNorm(BaseGradientClipAttr):
"""
Clips tensor values to a maximum L2-norm.
This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
:math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
:math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
.. math::
Out = \\frac{max\_norm * X}{norm(X)},
where :math:`norm(X)` represents the L2 norm of :math:`X`.
Args:
clip_norm (float): The maximum norm value
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByNorm(clip_norm=2.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, clip_norm): def __init__(self, clip_norm):
self.clip_norm = clip_norm self.clip_norm = clip_norm
...@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr): ...@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr):
class GradientClipByGlobalNorm(BaseGradientClipAttr): class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
Given a list of tensors t_list, and a clipping ratio clip_norm, this
operation returns a list of clipped tensors list_clipped and the global
norm (global_norm) of all tensors in t_list.
To perform the clipping, the values :math:`t\_list[i]` are set to:
.. math::
t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
where:
.. math::
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
otherwise they're all shrunk by the global ratio.
Args:
clip_norm (float): The maximum norm value
group_name (str, optional): The group name for this clip.
Examples:
.. code-block:: python
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
"""
def __init__(self, clip_norm, group_name="default_group"): def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, basestring): if not isinstance(group_name, basestring):
raise TypeError("'group_name' must be a basestring.") raise TypeError("'group_name' must be a basestring.")
...@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
def set_gradient_clip(clip, param_list=None, program=None): def set_gradient_clip(clip, param_list=None, program=None):
""" """
To specify parameters that require gradient clip. To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, Args:
which describes the type and detailed attributes of required gradient clip. clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
param_list(list, None by default): Parameters that require gradient clip. which describes the type and detailed attributes of required gradient clip.
It can be a list of parameter or a list of parameter's name. param_list(list(Variable)): Parameters that require gradient clip.
When it's None, all parameters in the program will be included. It can be a list of parameter or a list of parameter's name.
program(Program, None by default): The program where parameters are. When it's None, all parameters in the program will be included.
Will be the default main program when assigned with None. program(Program): The program where parameters are.
Will be the default main program when assigned with None.
""" """
if not isinstance(clip, BaseGradientClipAttr): if not isinstance(clip, BaseGradientClipAttr):
raise TypeError( raise TypeError(
......
...@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object): ...@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object):
self.place = place self.place = place
self.lod_level = lod_level self.lod_level = lod_level
self.shape = shape self.shape = shape
negtive_count = 0
for s in self.shape:
if s < 0:
negtive_count += 1
if negtive_count > 1:
self.shape = None
break
if dtype == core.VarDesc.VarType.FP32: if dtype == core.VarDesc.VarType.FP32:
self.dtype = 'float32' self.dtype = 'float32'
elif dtype == core.VarDesc.VarType.INT64: elif dtype == core.VarDesc.VarType.INT64:
...@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object): ...@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object):
self._feed_impl_(each_data, lod[1:], lod_level - 1) self._feed_impl_(each_data, lod[1:], lod_level - 1)
def done(self): def done(self):
arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) arr = numpy.array(self.data, dtype=self.dtype)
if self.shape:
arr = arr.reshape(self.shape)
t = core.LoDTensor() t = core.LoDTensor()
t.set(arr, self.place) t.set(arr, self.place)
if self.lod_level > 0: if self.lod_level > 0:
......
...@@ -645,7 +645,13 @@ class Operator(object): ...@@ -645,7 +645,13 @@ class Operator(object):
def set_attr(self, name, val): def set_attr(self, name, val):
self.attrs[name] = val self.attrs[name] = val
self.desc.set_attr(name, val) if isinstance(val, Block):
self.desc.set_block_attr(name, val.desc)
elif isinstance(val, core.BlockDesc) or \
isinstance(val, core.ProgramDesc):
self.desc.set_serialized_attr(name, val.serialize_to_string())
else:
self.desc.set_attr(name, val)
@property @property
def attr_names(self): def attr_names(self):
......
...@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ] ...@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ]
class Inferencer(object): class Inferencer(object):
"""
Inferencer High Level API.
Args:
infer_func (Python func): Infer function that will return predict Variable
param_path (str): The path where the inference model is saved by fluid.io.save_params
place (Place): place to do the inference
parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
Examples:
.. code-block:: python
def inference_program():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
return y_predict
place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path="/tmp/model", place=place)
"""
def __init__(self, infer_func, param_path, place=None, parallel=False): def __init__(self, infer_func, param_path, place=None, parallel=False):
"""
:param infer_func: a function that will return predict Variable
:param param_path: the path where the inference model is saved by fluid.io.save_params
:param place: place to do the inference
:param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
"""
self.param_path = param_path self.param_path = param_path
self.scope = core.Scope() self.scope = core.Scope()
self.parallel = parallel self.parallel = parallel
...@@ -60,9 +77,20 @@ class Inferencer(object): ...@@ -60,9 +77,20 @@ class Inferencer(object):
def infer(self, inputs, return_numpy=True): def infer(self, inputs, return_numpy=True):
""" """
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program Do Inference for Inputs
to get the predict value
:return: the predict value of the inference model Args:
inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
return_numpy (bool): transform return value into numpy or not
Returns:
Tensor or Numpy: the predict value of the inference model for the inputs
Examples:
.. code-block:: python
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
""" """
if not isinstance(inputs, dict): if not isinstance(inputs, dict):
raise ValueError( raise ValueError(
......
...@@ -19,26 +19,39 @@ from framework import convert_np_dtype_to_dtype_ ...@@ -19,26 +19,39 @@ from framework import convert_np_dtype_to_dtype_
from core import VarDesc from core import VarDesc
__all__ = [ __all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu', 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
'NormalInitializer', 'XavierInitializer', 'BilinearInitializer' 'UniformInitializer', 'NormalInitializer', 'XavierInitializer',
'BilinearInitializer', 'MSRAInitializer'
] ]
_force_init_on_cpu_ = False _force_init_on_cpu_ = False
def force_init_on_cpu(): def force_init_on_cpu():
"""
The flag of whether force to init variables on CPU.
Examples:
.. code-block:: python
if force_init_on_cpu():
pass
"""
return _force_init_on_cpu_ return _force_init_on_cpu_
@contextlib.contextmanager @contextlib.contextmanager
def init_on_cpu(): def init_on_cpu():
""" """
Switch program with `with` statement Force the variable to be inited on CPU.
Examples: Examples:
>>> with init_on_cpu(): .. code-block:: python
>>> step = layers.create_global_var()
with init_on_cpu():
step = layers.create_global_var()
""" """
global _force_init_on_cpu_ global _force_init_on_cpu_
...@@ -104,14 +117,18 @@ class Initializer(object): ...@@ -104,14 +117,18 @@ class Initializer(object):
class ConstantInitializer(Initializer): class ConstantInitializer(Initializer):
"""Implements the constant initializer """Implements the constant initializer
Args:
value (float): constant value to initialize the variable
Examples:
.. code-block:: python
fc = fluid.layers.fc(input=x, size=10,
param_attr=fluid.initializer.Constant(value=2.0))
""" """
def __init__(self, value=0.0, force_cpu=False): def __init__(self, value=0.0, force_cpu=False):
"""Constructor for ConstantInitializer
Args:
value: constant value to initialize the variable
"""
assert value is not None assert value is not None
super(ConstantInitializer, self).__init__() super(ConstantInitializer, self).__init__()
self._value = value self._value = value
...@@ -146,16 +163,20 @@ class ConstantInitializer(Initializer): ...@@ -146,16 +163,20 @@ class ConstantInitializer(Initializer):
class UniformInitializer(Initializer): class UniformInitializer(Initializer):
"""Implements the random uniform distribution initializer """Implements the random uniform distribution initializer
Args:
low (float): lower boundary of the uniform distribution
high (float): upper boundary of the uniform distribution
seed (int): random seed
Examples:
.. code-block:: python
fc = fluid.layers.fc(input=x, size=10,
param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5))
""" """
def __init__(self, low=-1.0, high=1.0, seed=0): def __init__(self, low=-1.0, high=1.0, seed=0):
"""Constructor for UniformInitializer
Args:
low: lower boundary of the uniform distribution
high: upper boundary of the uniform distribution
seed: random seed
"""
assert low is not None assert low is not None
assert high is not None assert high is not None
assert high >= low assert high >= low
...@@ -196,17 +217,21 @@ class UniformInitializer(Initializer): ...@@ -196,17 +217,21 @@ class UniformInitializer(Initializer):
class NormalInitializer(Initializer): class NormalInitializer(Initializer):
"""Implements the random Normal(Gaussian) distribution initializer """Implements the Random Normal(Gaussian) distribution initializer
Args:
loc (float): mean of the normal distribution
scale (float): standard deviation of the normal distribution
seed (int): random seed
Examples:
.. code-block:: python
fc = fluid.layers.fc(input=x, size=10,
param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
""" """
def __init__(self, loc=0.0, scale=1.0, seed=0): def __init__(self, loc=0.0, scale=1.0, seed=0):
"""Constructor for NormalInitializer
Args:
loc: mean of the normal distribution
scale: standard deviation of the normal distribution
seed: random seed
"""
assert loc is not None assert loc is not None
assert scale is not None assert scale is not None
assert seed is not None assert seed is not None
...@@ -246,39 +271,49 @@ class NormalInitializer(Initializer): ...@@ -246,39 +271,49 @@ class NormalInitializer(Initializer):
class XavierInitializer(Initializer): class XavierInitializer(Initializer):
"""Implements the Xavier initializer """
This class implements the Xavier weight initializer from the paper This class implements the Xavier weight initializer from the paper
Understanding the difficulty of training deep feedforward neural `Understanding the difficulty of training deep feedforward neural
networks[1] by Xavier Glorot and Yoshua Bengio. networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
by Xavier Glorot and Yoshua Bengio.
This initializer is designed to keep the scale of the gradients This initializer is designed to keep the scale of the gradients
approximately same in all the layers. In case of Uniform distribution, approximately same in all the layers. In case of Uniform distribution,
the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)). the range is [-x, x], where
.. math::
x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
In case of Normal distribution, the mean is 0 and the standard deviation In case of Normal distribution, the mean is 0 and the standard deviation
is sqrt(2/ (fan_in + fan_out)). is
.. math::
\sqrt{\\frac{2.0}{fan\_in + fan\_out}}
Args:
uniform (bool): whether to use uniform or normal distribution
fan_in (float): fan_in for Xavier initialization. If None, it is
inferred from the variable.
fan_out (float): fan_out for Xavier initialization. If None, it is
inferred from the variable.
seed (int): random seed
Note:
It is recommended to set fan_in and fan_out to None for most cases.
Examples:
.. code-block:: python
fc = fluid.layers.fc(
input=queries, size=10,
param_attr=fluid.initializer.Xavier(uniform=False))
References:
[1] Understanding the difficulty of training deep feedforward neural
networks. International conference on artificial intelligence and
statistics.
(http://proceedings.mlr.press/v9/glorot10a.html)
""" """
def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
"""Constructor for XavierInitializer
Args:
uniform: whether to use uniform or normal distribution
fan_in: fan_in for Xavier initialization. If None, it is
inferred from the variable.
fan_out: fan_out for Xavier initialization. If None, it is
inferred from the variable.
seed: random seed
Note: It is recommended to set fan_in and fan_out to None for
most cases.
"""
assert uniform is not None assert uniform is not None
assert seed is not None assert seed is not None
super(XavierInitializer, self).__init__() super(XavierInitializer, self).__init__()
...@@ -342,30 +377,42 @@ class MSRAInitializer(Initializer): ...@@ -342,30 +377,42 @@ class MSRAInitializer(Initializer):
"""Implements the MSRA initializer a.k.a. Kaiming Initializer """Implements the MSRA initializer a.k.a. Kaiming Initializer
This class implements the weight initialization from the paper This class implements the weight initialization from the paper
Delving Deep into Rectifiers: Surpassing Human-Level Performance on `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
and Jian Sun. This is a robust initialization method that particularly by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
considers the rectifier nonlinearities. In case of Uniform distribution, robust initialization method that particularly considers the rectifier
the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal nonlinearities. In case of Uniform distribution, the range is [-x, x], where
distribution, the mean is 0 and the standard deviation
is sqrt(2/ fan_in). .. math::
References: x = \sqrt{\\frac{6.0}{fan\_in}}
[1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
on ImageNet Classification In case of Normal distribution, the mean is 0 and the standard deviation
(https://arxiv.org/abs/1502.01852) is
.. math::
\sqrt{\\frac{2.0}{fan\_in}}
Args:
uniform (bool): whether to use uniform or normal distribution
fan_in (float): fan_in for MSRAInitializer. If None, it is\
inferred from the variable.
seed (int): random seed
Note:
It is recommended to set fan_in to None for most cases.
Examples:
.. code-block:: python
fc = fluid.layers.fc(
input=queries, size=10,
param_attr=fluid.initializer.MSRA(uniform=False))
""" """
def __init__(self, uniform=True, fan_in=None, seed=0): def __init__(self, uniform=True, fan_in=None, seed=0):
"""Constructor for MSRAInitializer """Constructor for MSRAInitializer
Args:
uniform: whether to use uniform or normal distribution
fan_in: fan_in for MSRAInitializer. If None, it is
inferred from the variable.
seed: random seed
Note: It is recommended to set fan_in to None for most cases.
""" """
assert uniform is not None assert uniform is not None
assert seed is not None assert seed is not None
...@@ -425,34 +472,37 @@ class MSRAInitializer(Initializer): ...@@ -425,34 +472,37 @@ class MSRAInitializer(Initializer):
class BilinearInitializer(Initializer): class BilinearInitializer(Initializer):
"""Implements the bilinear initializer. """
This initializer can be used in transposed convolution operator to This initializer can be used in transposed convolution operator to
act as upsampling. Users can upsample a feature map with shape of act as upsampling. Users can upsample a feature map with shape of
(B, C, H, W) by any integer factor. The usage is: (B, C, H, W) by any integer factor. The usage is:
>>> factor = 2 Examples:
>>> w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
>>> initializer=Bilinear()) .. code-block:: python
>>> conv_up = fluid.layers.conv2d_transpose(
>>> input, factor = 2
>>> num_filters=C, w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
>>> output_size=None, initializer=Bilinear())
>>> filter_size=2 * factor - factor % 2, conv_up = fluid.layers.conv2d_transpose(
>>> padding=ceil((factor - 1) / 2.), input,
>>> stride=factor, num_filters=C,
>>> groups=C, output_size=None,
>>> param_attr=w_attr, filter_size=2 * factor - factor % 2,
>>> bias_attr=False) padding=ceil((factor - 1) / 2.),
stride=factor,
groups=C,
Where, `num_filters=C` and `groups=C` means this is channel-wise tranposed param_attr=w_attr,
bias_attr=False)
Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`, convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
This initializer will set a (K, K) interpolation kernel for every channel This initializer will set a (K, K) interpolation kernel for every channel
of the filter identically. The resulting shape of the output feature map of the filter identically. The resulting shape of the output feature map
will be (B, C, factor * H, factor * W). Note that the learning rate and the will be (B, C, factor * H, factor * W). Note that the learning rate and the
weight decay are set to 0 in order to keep coefficient values of bilinear weight decay are set to 0 in order to keep coefficient values of bilinear
interpolation unchanged during training. interpolation unchanged during training.
""" """
def __init__(self): def __init__(self):
...@@ -469,7 +519,7 @@ class BilinearInitializer(Initializer): ...@@ -469,7 +519,7 @@ class BilinearInitializer(Initializer):
be added. be added.
Returns: Returns:
the initialization op Operator: the initialization op
Raises: Raises:
ValueError: If type of `var` and `block` is not right. ValueError: If type of `var` and `block` is not right.
......
...@@ -31,20 +31,42 @@ __all__ = [ ...@@ -31,20 +31,42 @@ __all__ = [
def is_parameter(var): def is_parameter(var):
"""Check whether the variable is a Parameter. """
Check whether the given variable is an instance of Parameter.
This function checks whether the input variable is a Parameter.
Args: Args:
var : The input variable. var(Variable): The variable to be checked.
Returns: Returns:
boolean result whether the variable is a Parameter. bool: True if the given `var` is an instance of Parameter,
False if not.
Examples:
.. code-block:: python
param = fluid.default_main_program().global_block().var('fc.w')
res = fluid.io.is_parameter(param)
""" """
return isinstance(var, Parameter) return isinstance(var, Parameter)
def is_persistable(var): def is_persistable(var):
"""
Check whether the given variable is persistable.
Args:
var(Variable): The variable to be checked.
Returns:
bool: True if the given `var` is persistable
False if not.
Examples:
.. code-block:: python
param = fluid.default_main_program().global_block().var('fc.w')
res = fluid.io.is_persistable(param)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST: var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
return False return False
...@@ -69,20 +91,69 @@ def save_vars(executor, ...@@ -69,20 +91,69 @@ def save_vars(executor,
predicate=None, predicate=None,
filename=None): filename=None):
""" """
Save variables to directory by executor. Save variables to the given directory by executor.
There are two ways to specify variables to be saved: The first way, list
variables in a list and assign it to the `vars`. The second way, assign the
`main_program` with an existing program, then all variables in the program
will be saved. The first way has a higher priority. In other words, if `vars`
are assigned, the `main_program` and the `predicate` will be ignored.
:param executor: executor that save variable The `dirname` are used to specify the folder where to save variables.
:param dirname: directory path If you prefer to save variables in separate files in the folder `dirname`,
:param main_program: program. If vars is None, then filter all variables in this set `filename` None; if you prefer to save all variables in a single file,
program which fit `predicate`. Default default_main_program. use `filename` to specify it.
:param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the corresponding input variable will be saved.
:param vars: variables need to be saved. If vars is specified, program & predicate
will be ignored
:param filename: The name of a single file that all vars are saved to.
If it is None, save variables to separate files.
:return: None Args:
executor(Executor): The executor to run for saving variables.
dirname(str): The directory path.
main_program(Program|None): The program whose variables will be saved.
If it is None, the default main program will
be used automatically.
Default: None
vars(list[Variable]|None): The list that contains all variables to save.
It has a higher priority than the `main_program`.
Default: None
predicate(function|None): If it is not None, only variables in the
`main_program` that makes predicate(variable)==True
will be saved. It only works when we are using the
`main_program` to specify variables (In other words
`vars` is None).
Default: None
filename(str|None): The file which to save all variables. If you prefer to save
variables separately, set it to None.
Default: None
Returns:
None
Raises:
TypeError: If `main_program` is not an instance of Program nor None.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
# The first usage: using `main_program` to specify variables
def name_has_fc(var):
res = "fc" in var.name
return res
prog = fluid.default_main_program()
fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
vars=None)
# All variables in `main_program` whose name includes "fc" will be saved.
# And variables are going to be saved separately.
# The second usage: using `vars` to specify variables
var_list = [var_a, var_b, var_c]
fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
filename="vars_file")
# var_a, var_b and var_c will be saved. And they are going to be
# saved in the same file named 'var_file' in the path "./my_paddle_model".
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
...@@ -130,7 +201,42 @@ def save_vars(executor, ...@@ -130,7 +201,42 @@ def save_vars(executor,
def save_params(executor, dirname, main_program=None, filename=None): def save_params(executor, dirname, main_program=None, filename=None):
""" """
Save all parameters to directory with executor. This function filters out all parameters from the give `main_program`
and then save them to the folder `dirname` or the file `filename`.
Use the `dirname` to specify the saving folder. If you would like to
save parameters in separate files, set `filename` None; if you would
like to save all parameters in a single file, use `filename` to specify
the file name.
NOTICE: Some variables are not Parameter while they are necessary for
training. So you can NOT save and continue your training just by
`save_params()` and `load_params()`. Please use `save_persistables()`
and `load_persistables()` instead.
Args:
executor(Executor): The executor to run for saving parameters.
dirname(str): The saving directory path.
main_program(Program|None): The program whose parameters will be
saved. If it is None, the default
main program will be used automatically.
Default: None
filename(str|None): The file to save all parameters. If you prefer
to save parameters in differnet files, set it
to None.
Default: None
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_params(executor=exe, dirname=param_path,
main_program=None)
""" """
save_vars( save_vars(
executor, executor,
...@@ -143,7 +249,37 @@ def save_params(executor, dirname, main_program=None, filename=None): ...@@ -143,7 +249,37 @@ def save_params(executor, dirname, main_program=None, filename=None):
def save_persistables(executor, dirname, main_program=None, filename=None): def save_persistables(executor, dirname, main_program=None, filename=None):
""" """
Save all persistables to directory with executor. This function filters out all variables with `persistable==True` from the
give `main_program` and then saves these variables to the folder `dirname`
or file `filename`.
The `dirname` is used to specify the folder where persistable variables
are going to be saved. If you would like to save variables in separate
files, set `filename` None; if you would like to save all variables in a
single file, use `filename` to specify the file name.
Args:
executor(Executor): The executor to run for saving persistable variables.
dirname(str): The directory path.
main_program(Program|None): The program whose persistbale variables will
be saved. If it is None, the default main
program will be used automatically.
Default: None
filename(str|None): The file to saved all variables. If you prefer to
save variables in differnet files, set it to None.
Default: None
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=None)
""" """
save_vars( save_vars(
executor, executor,
...@@ -161,20 +297,69 @@ def load_vars(executor, ...@@ -161,20 +297,69 @@ def load_vars(executor,
predicate=None, predicate=None,
filename=None): filename=None):
""" """
Load variables from directory by executor. Load variables from the given directory by executor.
There are two ways to specify variables to be loaded: The first way, list
variables in a list and assign it to the `vars`. The second way, assign the
`main_program` with an existing program, then all variables in the program
will be loaded. The first way has a higher priority. In other words if `vars`
are assigned, the `main_program` and the `predicate` will be ignored.
The `dirname` are used to specify the folder where to load variables.
If variables were saved in separate files in the folder `dirname`,
set `filename` None; if all variables were saved in a single file,
use `filename` to specify it.
:param executor: executor that load variable Args:
:param dirname: directory path executor(Executor): The executor to run for loading variables.
:param main_program: program. If vars is None, then filter all variables in this dirname(str): The directory path.
program which fit `predicate`. Default default_main_program(). main_program(Program|None): The program whose variables will be loaded.
:param predicate: The Predicate describes a callable that returns a variable If it is None, the default main program will
as a bool. If it returns true, the corresponding input variable will be loaded. be used automatically.
:param vars: variables need to be loaded. If vars is specified, program & Default: None
predicate will be ignored vars(list[Variable]|None): The list that contains all variables to load.
:param filename: The name of the single file that all vars are loaded from. It has a higher priority than the `main_program`.
If it is None, load variables from separate files. Default: None
predicate(function|None): If it is not None, only variables in the
`main_program` that makes predicate(variable)==True
will be loaded. It only works when we are using the
`main_program` to specify variables (In other words
`vars` is None).
Default: None
filename(str|None): The file which saved all required variables. If variables
were saved in differnet files, set it to None.
Default: None
Returns:
None
Raises:
TypeError: If `main_program` is not an instance of Program nor None.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
# The first usage: using `main_program` to specify variables
def name_has_fc(var):
res = "fc" in var.name
return res
:return: None prog = fluid.default_main_program()
fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
vars=None)
# All variables in `main_program` whose name includes "fc" will be loaded.
# And all the variables are supposed to have been saved in differnet files.
# The second usage: using `vars` to specify variables
var_list = [var_a, var_b, var_c]
fluid.io.load_vars(executor=exe, dirname=path, vars=var_list,
filename="vars_file")
# var_a, var_b and var_c will be loaded. And they are supposed to haven
# been saved in the same file named 'var_file' in the path "./my_paddle_model".
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
...@@ -222,7 +407,42 @@ def load_vars(executor, ...@@ -222,7 +407,42 @@ def load_vars(executor,
def load_params(executor, dirname, main_program=None, filename=None): def load_params(executor, dirname, main_program=None, filename=None):
""" """
load all parameters from directory by executor. This function filters out all parameters from the give `main_program`
and then trys to load these parameters from the folder `dirname` or
the file `filename`.
Use the `dirname` to specify the folder where parameters were saved. If
parameters were saved in separate files in the folder `dirname`, set
`filename` None; if all parameters were saved in a single file, use
`filename` to specify the file name.
NOTICE: Some variables are not Parameter while they are necessary for
training. So you can NOT save and continue your training just by
`save_params()` and `load_params()`. Please use `save_persistables()`
and `load_persistables()` instead.
Args:
executor(Executor): The executor to run for loading parameters.
dirname(str): The directory path.
main_program(Program|None): The program whose parameters will be
loaded. If it is None, the default
main program will be used automatically.
Default: None
filename(str|None): The file which saved all parameters. If parameters
were saved in differnet files, set it to None.
Default: None
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_params(executor=exe, dirname=param_path,
main_program=None)
""" """
load_vars( load_vars(
executor, executor,
...@@ -234,7 +454,37 @@ def load_params(executor, dirname, main_program=None, filename=None): ...@@ -234,7 +454,37 @@ def load_params(executor, dirname, main_program=None, filename=None):
def load_persistables(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None):
""" """
load all persistables from directory by executor. This function filters out all variables with `persistable==True` from the
give `main_program` and then trys to load these variables from the folder
`dirname` or the file `filename`.
Use the `dirname` to specify the folder where persistable variables were
saved. If variables were saved in separate files, set `filename` None;
if all variables were saved in a single file, use `filename` to specify
the file name.
Args:
executor(Executor): The executor to run for loading persistable variables.
dirname(str): The directory path.
main_program(Program|None): The program whose persistbale variables will
be loaded. If it is None, the default main
program will be used automatically.
Default: None
filename(str|None): The file which saved all variables. If variables were
saved in differnet files, set it to None.
Default: None
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_persistables(executor=exe, dirname=param_path,
main_program=None)
""" """
load_vars( load_vars(
executor, executor,
...@@ -307,22 +557,48 @@ def save_inference_model(dirname, ...@@ -307,22 +557,48 @@ def save_inference_model(dirname,
model_filename=None, model_filename=None,
params_filename=None): params_filename=None):
""" """
Build a model especially for inference, Prune the given `main_program` to build a new program especially for inference,
and save it to directory by the executor. and then save it and all related parameters to given `dirname` by the `executor`.
Args:
dirname(str): The directory path to save the inference model.
feeded_var_names(list[str]): Names of variables that need to be feeded data
during inference.
target_vars(list[Variable]): Variables from which we can get inference
results.
executor(Executor): The executor that saves the inference model.
main_program(Program|None): The original program, which will be pruned to
build the inference model. If is setted None,
the default main program will be used.
Default: None.
model_filename(str|None): The name of file to save the inference program
itself. If is setted None, a default filename
`__model__` will be used.
params_filename(str|None): The name of file to save all related parameters.
If it is setted None, parameters will be saved
in separate files .
:param dirname: directory path Returns:
:param feeded_var_names: Names of variables that need to be feeded data during inference None
:param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model Raises:
:param main_program: original program, which will be pruned to build the inference model. ValueError: If `feed_var_names` is not a list of basestring.
Default default_main_program(). ValueError: If `target_vars` is not a list of Variable.
:param model_filename: The name of file to save inference program.
If not specified, default filename `__model__` will be used. Examples:
:param params_filename: The name of file to save parameters. .. code-block:: python
It is used for the case that all parameters are saved in a single binary file.
If not specified, parameters are considered saved in separate files. exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"
fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
target_vars=[predict_var], executor=exe)
# In this exsample, the function will prune the default main program
# to make it suitable for infering the `predict_var`. The pruned
# inference program is going to be saved in the "./infer_model/__model__"
# and parameters are going to be saved in separate files under folder
# "./infer_model".
:return: None
""" """
if isinstance(feeded_var_names, basestring): if isinstance(feeded_var_names, basestring):
feeded_var_names = [feeded_var_names] feeded_var_names = [feeded_var_names]
...@@ -383,18 +659,49 @@ def load_inference_model(dirname, ...@@ -383,18 +659,49 @@ def load_inference_model(dirname,
""" """
Load inference model from a directory Load inference model from a directory
:param dirname: directory path Args:
:param executor: executor that load inference model dirname(str): The directory path
:param model_filename: The name of file to load inference program. executor(Executor): The executor to run for loading inference model.
If not specified, default filename `__model__` will be used. model_filename(str|None): The name of file to load inference program.
:param params_filename: The name of file to load parameters. If it is None, the default filename
It is used for the case that all parameters are saved in a single binary file. '__model__' will be used.
If not specified, parameters are considered saved in separate files. Default: None
params_filename(str|None): The name of file to load all parameters.
It is only used for the case that all
parameters were saved in a single binary
file. If parameters were saved in separate
files, set it as 'None'.
Returns:
tuple: The return of this function is a tuple with three elements:
(program, feed_target_names, fetch_targets). The `program` is a
Program, it's the program for inference. The `feed_target_names` is
a list of str, it contains Names of variables that need to feed
data in the inference program. The `fetch_targets` is a list of
Variable. It contains variables from which we can get inference
results.
Raises:
ValueError: If `dirname` is not a existing directory.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"
[inference_program, feed_target_names, fetch_targets] =
fluid.io.load_inference_model(dirname=path, executor=exe)
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# In this exsample, the inference program was saved in the
# "./infer_model/__model__" and parameters were saved in
# separate files in ""./infer_model".
# After getting inference program, feed target names and
# fetch targets, we can use an Executor to run the inference
# program to get the inference result.
:return: [program, feed_target_names, fetch_targets]
program: program especially for inference.
feed_target_names: Names of variables that need to feed data
fetch_targets: Variables from which we can get inference results.
""" """
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
...@@ -425,12 +732,25 @@ def load_inference_model(dirname, ...@@ -425,12 +732,25 @@ def load_inference_model(dirname,
def get_parameter_value(para, executor): def get_parameter_value(para, executor):
""" """
Get the LoDTensor for the parameter Get the LoDTensor value of the given parameter.
Args:
para(Parameter): The parameter to get value from.
executor(Executor): The executor to run for retrieving the value.
Returns:
numpy.array: The given parameter's values.
Raises:
AssertionError: If the `para` is not an instance of Parameter.
:param executor: executor for retrieving the value Examples:
:param para: the given parameter .. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param = fluid.default_main_program().global_block().var('fc.w')
p = fluid.io.get_parameter_value(param, exe)
:return: the LoDTensor for the parameter
""" """
assert is_parameter(para) assert is_parameter(para)
...@@ -442,14 +762,30 @@ def get_parameter_value(para, executor): ...@@ -442,14 +762,30 @@ def get_parameter_value(para, executor):
def get_parameter_value_by_name(name, executor, program=None): def get_parameter_value_by_name(name, executor, program=None):
""" """
Get the LoDTensor for paramter with the given name Get the LoDTensor value of a certain parameter by its name.
Args:
name(str): The parameter's name.
executor(Executor): The executor to run for retrieving the value.
program(Program | None): The program where to find the parameter.
If it's set to be None, the function will
try to find the parameter in the default
main program.
:param executor: executor for retrieving the value Returns:
:param name: the name of the parameter numpy.array: The parameter's values.
:param program: the program where the variable is found
Default default_main_program().
:return: the LoDTensor for the variable Raises:
TypeError: If given `name` is not an instance of basestring.
TypeError: If the parameter with the given name doesn't exist.
AssertionError: If there is a varibale named `name` in the
given program but it is not a Parameter.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
p = fluid.io.get_parameter_value('fc.w', exe)
""" """
if program is None: if program is None:
program = default_main_program() program = default_main_program()
...@@ -474,16 +810,58 @@ def save_checkpoint(executor, ...@@ -474,16 +810,58 @@ def save_checkpoint(executor,
lookup_table=None, lookup_table=None,
ps_endpoint_list=None): ps_endpoint_list=None):
""" """
Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, This function filters out all checkpoint variables from the give
the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy main_program and then saves these variables to the `checkpoint_dir`
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, directory.
The interval between two saved checkpoints must greater than save_interval_secs.
In the training precess, we generally save a checkpoint in each
iteration. So there might be a lot of checkpoints in the
`checkpoint_dir`. To avoid them taking too much disk space, the
`max_num_checkpoints` are introduced to limit the total number of
checkpoints. If the number of existing checkpints is greater than
the `max_num_checkpoints`, oldest ones will be scroll deleted.
A variable is a checkpoint variable and will be saved if it meets
all following conditions:
1. It's persistable.
2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
:param executor executor for save the value Args:
:param checkpoint_dir the checkpoint directory executor(Executor): The executor to run for save checkpoint.
:param trainer_id currect trainer id, if id is equal to 0, the trainer is chief checkpoint_dir(str): The folder where to save checkpoints.
:param main_program will save all variables in program trainer_id(int): currect trainer id, if id is equal to 0, the trainer
:param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints is chief.
trainer_args(dict|None): Current training arguments. Such as 'epoch_id'
and 'step_id'.
Defaut: None
main_program(Program|None): The program whose checkpoint variables will
be saved. If it is None, the default main program will be used.
max_num_checkpoints(int): The max number of total number of existing
checkpoints.
Default: 3
Returns:
None
Raises:
ValueError: If `checkpoint_dir` is None.
AssertionError: If `trainer_args` is not a dict.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
trainer_args = {"epoch_id": 200,
"step_id": 20} # just an example
fluid.io.save_checkpoint(executor=exe,
checkpoint_dir=path,
trainer_id=0,
trainer_args=trainer_args,
main_program=prog,
max_num_checkpoints=3)
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None") raise ValueError("'checkpoint_dir' should not be None")
...@@ -512,13 +890,50 @@ def save_checkpoint(executor, ...@@ -512,13 +890,50 @@ def save_checkpoint(executor,
def load_checkpoint(executor, checkpoint_dir, serial, main_program): def load_checkpoint(executor, checkpoint_dir, serial, main_program):
""" """
Load checkpoint from a directory by executor, This function filters out all checkpoint variables from the give
it will find the most recent saved checkpoint file and load it auto. main_program and then try to load these variables from the
`checkpoint_dir` directory.
In the training precess, we generally save a checkpoint in each
iteration. So there are more than one checkpoint in the
`checkpoint_dir` (each checkpoint has its own sub folder), use
`serial` to specify which serial of checkpoint you would like to
load.
A variable is a checkpoint variable and will be loaded if it meets
all following conditions:
1. It's persistable.
2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
Args:
executor(Executor): The executor to run for loading checkpoint.
checkpoint_dir(str): The folder where all checkpoints are.
serial(int): The serial of checkpoint you would like to load.
main_program(Program): The program whose checkpoint variables will
be loaded.
:param executor executor for load the value Returns:
:param checkpoint_dir the checkpoint directory None
:param serial the serial folder in checkpoint directory will be load
:param main_program will load all variables in program Raises:
ValueError: If `checkpoint_dir` is None.
ValueError: If `serial` is None or `serial` is less than 0.
ValueError: If `main_program` is None.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
serial=9, main_program=prog)
# In this example, `load_checkpoint` function
# will first filters out all checkpoint variables in the default
# main program, and then try to load these variables form the
# folder "./checkpoints/checkpoint_9/__model__".
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
...@@ -540,8 +955,8 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): ...@@ -540,8 +955,8 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
the trainer will call clean_checkpoint to delete checkpoint directory saved before. the trainer will call clean_checkpoint to delete checkpoint directory saved before.
delete_dir only works when the directory is empty, otherwise, OSError is raised. delete_dir only works when the directory is empty, otherwise, OSError is raised.
:param checkpoint_dir : param checkpoint_dir
:param delete_dir : param delete_dir
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
...@@ -557,13 +972,40 @@ def load_persist_vars_without_grad(executor, ...@@ -557,13 +972,40 @@ def load_persist_vars_without_grad(executor,
program, program,
has_model_dir=False): has_model_dir=False):
""" """
load_persist_vars_without_grad will load variables from a directory by an executor, This function filters out all checkpoint variables from the give
the variable named end with "@GRAD" will not be loaded. program and then trys to load these variables from the given directory.
A variable is a checkpoint variable if it meets all following
conditions:
1. It's persistable.
2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
:param executor executor for load the value Args:
:param dirname the checkpoint directory executor(Executor): The executor to run for loading variables.
:param program will load all variables in program dirname(str): The directory path.
:param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__ program(Program): The program whose checkpoint variables will
be loaded.
has_model_dir(bool): if True, the function loads variables
from a sub directory named '__model__'.
Default: False
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog, has_model_dir=True)
# In this example, `load_persist_vars_without_grad` function
# will first filters out all checkpoint variables in the default
# main program, and then trys to load these variables form the
# folder "./my_paddle_model/__model__".
""" """
if has_model_dir: if has_model_dir:
...@@ -603,12 +1045,38 @@ def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): ...@@ -603,12 +1045,38 @@ def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
def save_persist_vars_without_grad(executor, dirname, program): def save_persist_vars_without_grad(executor, dirname, program):
""" """
save_persist_vars_without_grad will save variables to a directory by an executor, This function filters out all checkpoint variables from the give
the variable named end with "@GRAD" will not be saved. program and then save these variables to a sub-folder '__model__' of
the given directory.
A variable is a checkpoint variable if it meets all following
conditions:
1. It's persistable.
2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
Args:
executor(Executor): The executor to run for saving variables.
dirname(str): The directory path.
program(Program): The program whose checkpoint variables will
be saved.
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog)
:param executor executor for load the value # In this example, `save_persist_vars_without_grad` function
:param dirname the checkpoint directory # will first filters out all checkpoint variables in the default
:param program will load all variables in program # main program, and then saves these variables to the folder
# "./my_paddle_model/__model__".
""" """
cur_dir = _get_model_dir(dirname) cur_dir = _get_model_dir(dirname)
save_vars( save_vars(
...@@ -673,7 +1141,7 @@ def _is_checkpoint_var(var): ...@@ -673,7 +1141,7 @@ def _is_checkpoint_var(var):
the checkpoint will not save or load all the variables. the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
:param var : param var
""" """
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
...@@ -763,7 +1231,7 @@ def _write_success(dirname): ...@@ -763,7 +1231,7 @@ def _write_success(dirname):
""" """
write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
:param dirname : param dirname
""" """
success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
with open(success_file, 'a') as f: with open(success_file, 'a') as f:
...@@ -775,7 +1243,7 @@ def get_latest_checkpoint_serial(checkpoint_dir): ...@@ -775,7 +1243,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
""" """
get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
:param checkpoint_dir : param checkpoint_dir
""" """
if not checkpoint_dir: if not checkpoint_dir:
return -1 return -1
......
...@@ -185,12 +185,14 @@ def Print(input, ...@@ -185,12 +185,14 @@ def Print(input,
Returns: Returns:
Variable: Output tensor, same data with input tensor. Variable: Output tensor, same data with input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
value = some_layer(...) value = some_layer(...)
Print(value, summarize=10, Print(value, summarize=10,
message="The content of some_layer: ") message="The content of some_layer: ")
''' '''
helper = LayerHelper('print', **locals()) helper = LayerHelper('print', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_tmp_variable(dtype=helper.input_dtype())
...@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard): ...@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):
class ConditionalBlock(object): class ConditionalBlock(object):
'''
**ConditionalBlock**
ConditionalBlock is an operator that bind a block to a specific condition,
if the condition matches, the corresponding block will be executed.
Args:
inputs (Variable): bool conditions.
is_scalar_condition (bool): whether the branch is controled by a scalar.
name(str): name of this ConditionalBlock.
Examples:
.. code-block:: python
cond = layers.less_than(x=label, y=limit)
true_image, false_image = layers.split_lod_tensor(
input=image, mask=cond)
true_cond = layers.ConditionalBlock([true_image])
with true_cond.block():
...
with false_cond.block():
...
'''
def __init__(self, inputs, is_scalar_condition=False, name=None): def __init__(self, inputs, is_scalar_condition=False, name=None):
for each_input in inputs: for each_input in inputs:
if not isinstance(each_input, Variable): if not isinstance(each_input, Variable):
......
...@@ -16,7 +16,7 @@ All layers just related to the detection neural network. ...@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
""" """
from layer_function_generator import generate_layer_fn from layer_function_generator import generate_layer_fn
from layer_function_generator import autodoc from layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
import tensor import tensor
import nn import nn
...@@ -155,7 +155,7 @@ def detection_output(loc, ...@@ -155,7 +155,7 @@ def detection_output(loc,
return nmsed_outs return nmsed_outs
@autodoc() @templatedoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
class_num, class_num,
...@@ -166,6 +166,47 @@ def detection_map(detect_res, ...@@ -166,6 +166,47 @@ def detection_map(detect_res,
input_states=None, input_states=None,
out_states=None, out_states=None,
ap_version='integral'): ap_version='integral'):
"""
${comment}
Args:
detect_res: ${detect_res_comment}
label: ${label_comment}
class_num: ${class_num_comment}
background_label: ${background_label_comment}
overlap_threshold: ${overlap_threshold_comment}
evaluate_difficult: ${evaluate_difficult_comment}
has_state: ${has_state_comment}
input_states: If not None, It contains 3 elements:
1. pos_count ${pos_count_comment}.
2. true_pos ${true_pos_comment}.
3. false_pos ${false_pos_comment}.
out_states: If not None, it contains 3 elements.
1. accum_pos_count ${accum_pos_count_comment}.
2. accum_true_pos ${accum_true_pos_comment}.
3. accum_false_pos ${accum_false_pos_comment}.
ap_version: ${ap_type_comment}
Returns:
${map_comment}
Examples:
.. code-block:: python
detect_res = fluid.layers.data(
name='detect_res',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
label = fluid.layers.data(
name='label',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
map_out = fluid.layers.detection_map(detect_res, label, 21)
"""
helper = LayerHelper("detection_map", **locals()) helper = LayerHelper("detection_map", **locals())
def __create_var(type): def __create_var(type):
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
All layers just related to the neural network. All layers just related to the neural network.
""" """
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
...@@ -93,6 +93,7 @@ __all__ = [ ...@@ -93,6 +93,7 @@ __all__ = [
'mean_iou', 'mean_iou',
'relu', 'relu',
'log', 'log',
'crop',
] ]
...@@ -108,14 +109,14 @@ def fc(input, ...@@ -108,14 +109,14 @@ def fc(input,
""" """
**Fully Connected Layer** **Fully Connected Layer**
This function creates a fully connected layer in the network. It can take This function creates a fully connected layer in the network. It can take
multiple tensors as its inputs. It creates a variable called weights for multiple tensors as its inputs. It creates a variable called weights for
each input tensor, which represents a fully connected weight matrix from each input tensor, which represents a fully connected weight matrix from
each input unit to each output unit. The fully connected layer multiplies each input unit to each output unit. The fully connected layer multiplies
each input tensor with its coresponding weight to produce an output Tensor. each input tensor with its coresponding weight to produce an output Tensor.
If multiple input tensors are given, the results of multiple multiplications If multiple input tensors are given, the results of multiple multiplications
will be sumed up. If bias_attr is not None, a bias variable will be created will be sumed up. If bias_attr is not None, a bias variable will be created
and added to the output. Finally, if activation is not None, it will be applied and added to the output. Finally, if activation is not None, it will be applied
to the output as well. to the output as well.
This process can be formulated as follows: This process can be formulated as follows:
...@@ -197,7 +198,10 @@ def fc(input, ...@@ -197,7 +198,10 @@ def fc(input,
else: else:
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) type="sum",
inputs={"X": mul_results},
outputs={"Out": pre_bias},
attrs={"use_mkldnn": use_mkldnn})
# add bias # add bias
pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
# add activation # add activation
...@@ -846,7 +850,7 @@ def crf_decoding(input, param_attr, label=None): ...@@ -846,7 +850,7 @@ def crf_decoding(input, param_attr, label=None):
Returns: Returns:
Variable: ${viterbi_path_comment} Variable: ${viterbi_path_comment}
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1084,7 +1088,7 @@ def chunk_eval(input, ...@@ -1084,7 +1088,7 @@ def chunk_eval(input,
Here is a NER example of labeling for these tagging schemes: Here is a NER example of labeling for these tagging schemes:
.. code-block:: python .. code-block:: python
====== ====== ====== ===== == ============ ===== ===== ===== == ========= ====== ====== ====== ===== == ============ ===== ===== ===== == =========
Li Ming works at Agricultural Bank of China in Beijing. Li Ming works at Agricultural Bank of China in Beijing.
====== ====== ====== ===== == ============ ===== ===== ===== == ========= ====== ====== ====== ===== == ============ ===== ===== ===== == =========
...@@ -1110,7 +1114,7 @@ def chunk_eval(input, ...@@ -1110,7 +1114,7 @@ def chunk_eval(input,
is the num of chunk types, and `tag_type` get its value from the following table. is the num of chunk types, and `tag_type` get its value from the following table.
.. code-block:: python .. code-block:: python
Scheme Begin Inside End Single Scheme Begin Inside End Single
plain 0 - - - plain 0 - - -
IOB 0 1 - - IOB 0 1 - -
...@@ -1146,7 +1150,7 @@ def chunk_eval(input, ...@@ -1146,7 +1150,7 @@ def chunk_eval(input,
tuple: tuple containing: precision, recall, f1_score, tuple: tuple containing: precision, recall, f1_score,
num_infer_chunks, num_label_chunks, num_infer_chunks, num_label_chunks,
num_correct_chunks num_correct_chunks
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1246,7 +1250,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): ...@@ -1246,7 +1250,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
""" """
This function computes the softmax activation among all time-steps for each This function computes the softmax activation among all time-steps for each
sequence. The dimension of each time-step should be 1. Thus, the shape of sequence. The dimension of each time-step should be 1. Thus, the shape of
input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N`
is the sum of the length of all sequences. is the sum of the length of all sequences.
For i-th sequence in a mini-batch: For i-th sequence in a mini-batch:
...@@ -1266,7 +1270,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): ...@@ -1266,7 +1270,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
param_attr (ParamAttr|None): attributes for parameter param_attr (ParamAttr|None): attributes for parameter
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
library is installed. Default: True library is installed. Default: True
Returns: Returns:
Variable: output of sequence_softmax Variable: output of sequence_softmax
...@@ -1827,11 +1831,11 @@ def pool2d(input, ...@@ -1827,11 +1831,11 @@ def pool2d(input,
${comment} ${comment}
Args: Args:
input (Variable): The input tensor of pooling operator. The format of input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the the number of channels, H is the height of the
feature, and W is the width of the feature. feature, and W is the width of the feature.
pool_size (int): The side length of pooling windows. All pooling pool_size (int): The side length of pooling windows. All pooling
windows are squares with pool_size on a side. windows are squares with pool_size on a side.
pool_type: ${pooling_type_comment} pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer. pool_stride (int): stride of the pooling layer.
...@@ -1840,7 +1844,7 @@ def pool2d(input, ...@@ -1840,7 +1844,7 @@ def pool2d(input,
use_cudnn: ${use_cudnn_comment} use_cudnn: ${use_cudnn_comment}
ceil_mode: ${ceil_mode_comment} ceil_mode: ${ceil_mode_comment}
use_mkldnn: ${use_mkldnn_comment} use_mkldnn: ${use_mkldnn_comment}
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
Returns: Returns:
...@@ -1858,10 +1862,10 @@ def pool2d(input, ...@@ -1858,10 +1862,10 @@ def pool2d(input,
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32') name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.pool2d( conv2d = fluid.layers.pool2d(
input=data, input=data,
pool_size=2, pool_size=2,
pool_type='max', pool_type='max',
pool_stride=1, pool_stride=1,
global_pooling=False) global_pooling=False)
""" """
if pool_type not in ["max", "avg"]: if pool_type not in ["max", "avg"]:
...@@ -2226,14 +2230,14 @@ def beam_search_decode(ids, scores, name=None): ...@@ -2226,14 +2230,14 @@ def beam_search_decode(ids, scores, name=None):
This layers is to pack the output of beam search layer into sentences and This layers is to pack the output of beam search layer into sentences and
associated scores. It is usually called after the beam search layer. associated scores. It is usually called after the beam search layer.
Typically, the output of beam search layer is a tensor of selected ids, with Typically, the output of beam search layer is a tensor of selected ids, with
a tensor of the score of each id. Beam search layer's output ids, however, a tensor of the score of each id. Beam search layer's output ids, however,
are generated directly during the tree search, and they are stacked by each are generated directly during the tree search, and they are stacked by each
level of the search tree. Thus we need to reorganize them into sentences, level of the search tree. Thus we need to reorganize them into sentences,
based on the score of each id. This layer takes the output of beam search based on the score of each id. This layer takes the output of beam search
layer as input and repack them into sentences. layer as input and repack them into sentences.
Args: Args:
ids (Variable): The selected ids, output of beam search layer. ids (Variable): The selected ids, output of beam search layer.
scores (Variable): The associated scores of the ids, out put of beam scores (Variable): The associated scores of the ids, out put of beam
search layer. search layer.
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
...@@ -2241,7 +2245,7 @@ def beam_search_decode(ids, scores, name=None): ...@@ -2241,7 +2245,7 @@ def beam_search_decode(ids, scores, name=None):
Returns: Returns:
tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores. tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
sentence_ids is a tensor with shape [size, length], where size is the sentence_ids is a tensor with shape [size, length], where size is the
beam size of beam search, and length is the length of each sentence. beam size of beam search, and length is the length of each sentence.
Note that the length of sentences may vary. Note that the length of sentences may vary.
sentence_scores is a tensor with the same shape as sentence_ids. sentence_scores is a tensor with the same shape as sentence_ids.
...@@ -2674,18 +2678,35 @@ def sequence_expand(x, y, ref_level=-1, name=None): ...@@ -2674,18 +2678,35 @@ def sequence_expand(x, y, ref_level=-1, name=None):
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
''' '''
**beam search**
This function implements the beam search algorithm. This function implements the beam search algorithm.
Beam search is a classical algorithm for selecting candidate words
in a machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
Args: Args:
pre_ids (Variable): ${pre_ids_comment} pre_ids (Variable): ids in previous step.
ids (Variable): ${ids_comment} ids (Variable): a LoDTensor of shape of [None,k]
scores (Variable): ${scores_comment} scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
beam_size (int): ${beam_size_comment} beam_size (int): beam size for beam search
end_id (int): ${end_id_comment} end_id (int): the token id which indicates the end of a sequence
level (int): ${level_comment} level (int): the level of LoDTensor
Returns: Returns:
tuple: a tuple of beam_search output variables: selected_ids, selected_scores tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`
Examples:
.. code-block:: python
# current_score is a Tensor of shape (num_batch_size, embed_size), which
# consists score of each candidate word.
topk_scores, topk_indices = pd.topk(current_score, k=50)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
''' '''
helper = LayerHelper('beam_search', **locals()) helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype score_type = scores.dtype
...@@ -2901,7 +2922,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): ...@@ -2901,7 +2922,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
`None`, compute the mean over all elements of :attr:`input` `None`, compute the mean over all elements of :attr:`input`
and return a variable with a single element, otherwise it and return a variable with a single element, otherwise it
must be in the range :math:`[-rank(input), rank(input))`. If must be in the range :math:`[-rank(input), rank(input))`. If
:math:`dim[i] < 0`, the dimension to reduce is :math:`dim[i] < 0`, the dimension to reduce is
:math:`rank(input) + dim[i]`. :math:`rank(input) + dim[i]`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
...@@ -3372,16 +3393,16 @@ def topk(input, k, name=None): ...@@ -3372,16 +3393,16 @@ def topk(input, k, name=None):
Args: Args:
input(Variable): The input variable which can be a vector or Tensor with input(Variable): The input variable which can be a vector or Tensor with
higher rank. higher rank.
k(int): The number of top elements to look for along the last dimension k(int): The number of top elements to look for along the last dimension
of input. of input.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Default: None Default: None
Returns: Returns:
Tuple[Variable]: A tuple with two elements. Each element is a Variable. Tuple[Variable]: A tuple with two elements. Each element is a Variable.
The first one is k largest elements along each last The first one is k largest elements along each last
dimensional slice. The second one is indices of values dimensional slice. The second one is indices of values
within the last dimension of input. within the last dimension of input.
Raises: Raises:
...@@ -3576,15 +3597,15 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -3576,15 +3597,15 @@ def warpctc(input, label, blank=0, norm_by_times=False):
It's shape is [Lp, num_classes + 1], where Lp is the sum of all input It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
sequences' length and num_classes is the true number of classes. sequences' length and num_classes is the true number of classes.
(not including the blank label). (not including the blank label).
label (Variable): The ground truth of variable-length sequence, label (Variable): The ground truth of variable-length sequence,
which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1], which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
where Lg is th sum of all labels' length. where Lg is th sum of all labels' length.
blank (int, default 0): The blank label index of Connectionist blank (int, default 0): The blank label index of Connectionist
Temporal Classification (CTC) loss, which is in the Temporal Classification (CTC) loss, which is in the
half-opened interval [0, num_classes + 1). half-opened interval [0, num_classes + 1).
norm_by_times(bool, default false): Whether to normalize the gradients norm_by_times(bool, default false): Whether to normalize the gradients
by the number of time-step, which is also the sequence's length. by the number of time-step, which is also the sequence's length.
There is no need to normalize the gradients if warpctc layer was There is no need to normalize the gradients if warpctc layer was
follewed by a mean_op. follewed by a mean_op.
Returns: Returns:
...@@ -3690,8 +3711,8 @@ def nce(input, ...@@ -3690,8 +3711,8 @@ def nce(input,
input (Variable): input variable. input (Variable): input variable.
label (Variable): label. label (Variable): label.
num_total_classes (int):${num_total_classes_comment} num_total_classes (int):${num_total_classes_comment}
sample_weight (Variable|None): A Variable of shape [batch_size, 1] sample_weight (Variable|None): A Variable of shape [batch_size, 1]
storing a weight for each sample. The default weight for each storing a weight for each sample. The default weight for each
sample is 1.0. sample is 1.0.
param_attr (ParamAttr|None): attributes for parameter param_attr (ParamAttr|None): attributes for parameter
bias_attr (ParamAttr|None): attributes for bias bias_attr (ParamAttr|None): attributes for bias
...@@ -4081,7 +4102,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): ...@@ -4081,7 +4102,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
It takes the first dimension of :attr:`x` and :attr:`y` as batch size. It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
For each instance, it computes the smooth L1 loss element by element first For each instance, it computes the smooth L1 loss element by element first
and then sums all the losses. So the shape of ouput Variable is and then sums all the losses. So the shape of ouput Variable is
[batch_size, 1]. [batch_size, 1].
Args: Args:
...@@ -4090,14 +4111,14 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): ...@@ -4090,14 +4111,14 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
y (Variable): A tensor with rank at least 2. The target value of smooth y (Variable): A tensor with rank at least 2. The target value of smooth
L1 loss op with same shape as :attr:`x`. L1 loss op with same shape as :attr:`x`.
inside_weight (Variable|None): A tensor with rank at least 2. This inside_weight (Variable|None): A tensor with rank at least 2. This
input is optional and should have same shape with :attr:`x`. If input is optional and should have same shape with :attr:`x`. If
provided, the result of (:attr:`x` - :attr:`y`) will be multiplied provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
by this tensor element by element. by this tensor element by element.
outside_weight (Variable|None): A tensor with rank at least 2. This outside_weight (Variable|None): A tensor with rank at least 2. This
input is optional and should have same shape with :attr:`x`. If input is optional and should have same shape with :attr:`x`. If
provided, the out smooth L1 loss will be multiplied by this tensor provided, the out smooth L1 loss will be multiplied by this tensor
element by element. element by element.
sigma (float|None): Hyper parameter of smooth L1 loss layer. A float sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
scalar with default value 1.0. scalar with default value 1.0.
Returns: Returns:
...@@ -4143,7 +4164,7 @@ def one_hot(input, depth): ...@@ -4143,7 +4164,7 @@ def one_hot(input, depth):
Examples: Examples:
.. code-block:: python .. code-block:: python
label = layers.data(name="label", shape=[1], dtype="float32") label = layers.data(name="label", shape=[1], dtype="float32")
one_hot_label = layers.one_hot(input=label, depth=10) one_hot_label = layers.one_hot(input=label, depth=10)
""" """
...@@ -4297,10 +4318,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4297,10 +4318,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
def lod_reset(x, y=None, target_lod=None): def lod_reset(x, y=None, target_lod=None):
""" """
Set LoD of :attr:`x` to a new one specified by :attr:`y` or Set LoD of :attr:`x` to a new one specified by :attr:`y` or
:attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be
considered as target LoD first, otherwise :attr:`y.data` would be considered as target LoD first, otherwise :attr:`y.data` would be
considered as target LoD. If :attr:`y` is not provided, target LoD should considered as target LoD. If :attr:`y` is not provided, target LoD should
be specified by :attr:`target_lod`. If target LoD is specified by be specified by :attr:`target_lod`. If target LoD is specified by
:attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported. :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported.
.. code-block:: text .. code-block:: text
...@@ -4354,7 +4375,7 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -4354,7 +4375,7 @@ def lod_reset(x, y=None, target_lod=None):
Args: Args:
x (Variable): Input variable which could be a Tensor or LodTensor. x (Variable): Input variable which could be a Tensor or LodTensor.
y (Variable|None): If provided, output's LoD would be derived y (Variable|None): If provided, output's LoD would be derived
from :attr:`y`. from :attr:`y`.
target_lod (list|tuple|None): One level LoD which should be considered target_lod (list|tuple|None): One level LoD which should be considered
as target LoD when :attr:`y` not provided. as target LoD when :attr:`y` not provided.
...@@ -4670,7 +4691,7 @@ def image_resize(input, ...@@ -4670,7 +4691,7 @@ def image_resize(input,
""" """
**Resize a Batch of Images** **Resize a Batch of Images**
The input must be a tensor of the shape (num_batches, channels, in_h, in_w), The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
and the resizing only applies on the last two dimensions(hight and width). and the resizing only applies on the last two dimensions(hight and width).
Supporting resample methods: Supporting resample methods:
...@@ -4766,9 +4787,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None): ...@@ -4766,9 +4787,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
def image_resize_short(input, out_short_len, resample='BILINEAR'): def image_resize_short(input, out_short_len, resample='BILINEAR'):
""" """
Resize a batch of images. The short edge of input images will be Resize a batch of images. The short edge of input images will be
resized to the given 'out_short_len'. The long edge of input images resized to the given 'out_short_len'. The long edge of input images
will be resized proportionately to make images' length-width ratio will be resized proportionately to make images' length-width ratio
constant. constant.
Args: Args:
...@@ -4801,7 +4822,7 @@ def gather(input, index): ...@@ -4801,7 +4822,7 @@ def gather(input, index):
""" """
**Gather Layer** **Gather Layer**
Output is obtained by gathering entries of the outer-most dimension Output is obtained by gathering entries of the outer-most dimension
of X indexed by `index` and concatenate them together. of X indexed by `index` and concatenate them together.
.. math:: .. math::
...@@ -4826,7 +4847,7 @@ def gather(input, index): ...@@ -4826,7 +4847,7 @@ def gather(input, index):
[5, 6]] [5, 6]]
Args: Args:
input (Variable): The source input with rank>=1. input (Variable): The source input with rank>=1.
index (Variable): The index input with rank=1. index (Variable): The index input with rank=1.
Returns: Returns:
...@@ -4862,7 +4883,7 @@ def random_crop(x, shape, seed=None): ...@@ -4862,7 +4883,7 @@ def random_crop(x, shape, seed=None):
Returns: Returns:
${out_comment} ${out_comment}
Examples: Examples:
>>> img = fluid.layers.data("img", [3, 256, 256]) >>> img = fluid.layers.data("img", [3, 256, 256])
>>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
...@@ -4908,7 +4929,7 @@ def log(x): ...@@ -4908,7 +4929,7 @@ def log(x):
Out = \\ln(x) Out = \\ln(x)
Args: Args:
x (Variable): Input tensor. x (Variable): Input tensor.
Returns: Returns:
Variable: The natural log of the input tensor computed element-wise. Variable: The natural log of the input tensor computed element-wise.
...@@ -4937,7 +4958,7 @@ def relu(x): ...@@ -4937,7 +4958,7 @@ def relu(x):
Out = \\max(0, x) Out = \\max(0, x)
Args: Args:
x (Variable): The input tensor. x (Variable): The input tensor.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -4958,15 +4979,15 @@ def relu(x): ...@@ -4958,15 +4979,15 @@ def relu(x):
def mean_iou(input, label, num_classes): def mean_iou(input, label, num_classes):
""" """
Mean Intersection-Over-Union is a common evaluation metric for Mean Intersection-Over-Union is a common evaluation metric for
semantic image segmentation, which first computes the IOU for each semantic image segmentation, which first computes the IOU for each
semantic class and then computes the average over classes. semantic class and then computes the average over classes.
IOU is defined as follows: IOU is defined as follows:
.. math:: .. math::
IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
The predictions are accumulated in a confusion matrix and mean-IOU The predictions are accumulated in a confusion matrix and mean-IOU
is then calculated from it. is then calculated from it.
...@@ -4979,12 +5000,12 @@ def mean_iou(input, label, num_classes): ...@@ -4979,12 +5000,12 @@ def mean_iou(input, label, num_classes):
Returns: Returns:
mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
Examples: Examples:
.. code-block:: python .. code-block:: python
iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes) iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
""" """
helper = LayerHelper('mean_iou', **locals()) helper = LayerHelper('mean_iou', **locals())
...@@ -5003,3 +5024,101 @@ def mean_iou(input, label, num_classes): ...@@ -5003,3 +5024,101 @@ def mean_iou(input, label, num_classes):
}, },
attrs={"num_classes": num_classes}) attrs={"num_classes": num_classes})
return out_mean_iou, out_wrong, out_correct return out_mean_iou, out_wrong, out_correct
def crop(x, shape=None, offsets=None, name=None):
"""
Crop input into output, as specified by offsets and shape.
.. code-block:: text
* Case 1:
Given
X = [[0, 1, 2, 0, 0]
[0, 3, 4, 0, 0]
[0, 0, 0, 0, 0]],
and
shape = [2, 2],
offsets = [0, 1],
output is:
Out = [[1, 2],
[3, 4]].
* Case 2:
Given
X = [[0, 1, 2, 5, 0]
[0, 3, 4, 6, 0]
[0, 0, 0, 0, 0]],
and shape is tensor
shape = [[0, 0, 0]
[0, 0, 0]]
and
offsets = [0, 1],
output is:
Out = [[1, 2, 5],
[3, 4, 6]].
Args:
x (Variable): The input tensor variable.
shape (Variable|list/tuple of integer): The output shape is specified
by `shape`, which can a Variable or a list/tupe of integer.
If a tensor Variable, it's rank must be the same as `x`. This way
is suitable for the case that the output shape may be changed each
iteration. If a list/tupe of integer, it's length must be the same
as the rank of `x`
offsets (Variable|list/tuple of integer|None): Specifies the copping
offsets at each dimension. It can be a Variable or or a list/tupe
of integer. If a tensor Variable, it's rank must be the same as `x`.
This way is suitable for the case that the offsets may be changed
each iteration. If a list/tupe of integer, it's length must be the
same as the rank of `x`. If None, the offsets are 0 at each
dimension.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The cropped tensor variable.
Raises:
ValueError: If shape is not a list, tuple or Variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
crop = fluid.layers.crop(x, shape=y)
# or
z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
crop = fluid.layers.crop(z, shape=[2, 3])
"""
helper = LayerHelper('crop', **locals())
if not (isinstance(shape, list) or isinstance(shape, tuple) or \
isinstance(shape, Variable)):
raise ValueError("The shape should be a list, tuple or Variable.")
if offsets is None:
offsets = [0] * len(x.shape)
out = helper.create_tmp_variable(x.dtype)
ipts = {'X': x}
attrs = {}
if isinstance(shape, Variable):
ipts['Y'] = shape
else:
attrs['shape'] = shape
if isinstance(offsets, Variable):
ipts['Offsets'] = offsets
else:
attrs['offsets'] = offsets
helper.append_op(
type='crop',
inputs=ipts,
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
...@@ -230,7 +230,11 @@ def sums(input, out=None): ...@@ -230,7 +230,11 @@ def sums(input, out=None):
helper = LayerHelper('sum', **locals()) helper = LayerHelper('sum', **locals())
if out is None: if out is None:
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) helper.append_op(
type='sum',
inputs={'X': input},
outputs={'Out': out},
attrs={'use_mkldnn': False})
return out return out
...@@ -380,7 +384,7 @@ def argmin(x, axis=0): ...@@ -380,7 +384,7 @@ def argmin(x, axis=0):
""" """
**argmin** **argmin**
This function computes the indices of the min elements This function computes the indices of the min elements
of the input tensor's element along the provided axis. of the input tensor's element along the provided axis.
Args: Args:
...@@ -395,7 +399,7 @@ def argmin(x, axis=0): ...@@ -395,7 +399,7 @@ def argmin(x, axis=0):
.. code-block:: python .. code-block:: python
out = fluid.layers.argmin(x=in, axis=0) out = fluid.layers.argmin(x=in, axis=0)
out = fluid.layers.argmin(x=in, axis=-1) out = fluid.layers.argmin(x=in, axis=-1)
""" """
helper = LayerHelper("arg_min", **locals()) helper = LayerHelper("arg_min", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64) out = helper.create_tmp_variable(VarDesc.VarType.INT64)
...@@ -411,7 +415,7 @@ def argmax(x, axis=0): ...@@ -411,7 +415,7 @@ def argmax(x, axis=0):
""" """
**argmax** **argmax**
This function computes the indices of the max elements This function computes the indices of the max elements
of the input tensor's element along the provided axis. of the input tensor's element along the provided axis.
Args: Args:
...@@ -426,7 +430,7 @@ def argmax(x, axis=0): ...@@ -426,7 +430,7 @@ def argmax(x, axis=0):
.. code-block:: python .. code-block:: python
out = fluid.layers.argmax(x=in, axis=0) out = fluid.layers.argmax(x=in, axis=0)
out = fluid.layers.argmax(x=in, axis=-1) out = fluid.layers.argmax(x=in, axis=-1)
""" """
helper = LayerHelper("arg_max", **locals()) helper = LayerHelper("arg_max", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64) out = helper.create_tmp_variable(VarDesc.VarType.INT64)
...@@ -495,9 +499,9 @@ def reverse(x, axis): ...@@ -495,9 +499,9 @@ def reverse(x, axis):
Args: Args:
x(Vairbale): the input to be reversed. x(Vairbale): the input to be reversed.
axis(int|tuple|list): Axis that along which order of elements axis(int|tuple|list): Axis that along which order of elements
is reversed. If it is a tuple or a list, reversing is reversed. If it is a tuple or a list, reversing
will be apply on each axis in the tuple or list. will be apply on each axis in the tuple or list.
Returns: Returns:
Variable: The reversed tensor. Variable: The reversed tensor.
...@@ -528,9 +532,9 @@ def save(x, file_path, overwrite=True): ...@@ -528,9 +532,9 @@ def save(x, file_path, overwrite=True):
Args: Args:
x(variable): The Tensor/LoDTensor to be saved. x(variable): The Tensor/LoDTensor to be saved.
file_path(str): The file path where the variable will be saved. file_path(str): The file path where the variable will be saved.
overwrite(bool): Whether or not cover the given file when it has already overwrite(bool): Whether or not cover the given file when it has already
existed. If it's set 'False' and the file is existed, a runtime existed. If it's set 'False' and the file is existed, a runtime
error will be thrown. error will be thrown.
""" """
helper = LayerHelper("save", **locals()) helper = LayerHelper("save", **locals())
helper.append_op( helper.append_op(
...@@ -550,8 +554,8 @@ def save_combine(x, file_path, overwrite=True): ...@@ -550,8 +554,8 @@ def save_combine(x, file_path, overwrite=True):
a single file. a single file.
file_path(str): The file path where variables will be saved. file_path(str): The file path where variables will be saved.
overwrite(bool): Whether or not cover the given file when it has already overwrite(bool): Whether or not cover the given file when it has already
existed. If it's set 'False' and the file is existed, a runtime existed. If it's set 'False' and the file is existed, a runtime
error will be thrown. error will be thrown.
Returns: Returns:
There is no return value. There is no return value.
......
...@@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback ...@@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager from contextlib import contextmanager
__all__ = [ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
'Adadelta', 'ModelAverage', 'Optimizer' 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
] ]
...@@ -192,15 +192,15 @@ class Optimizer(object): ...@@ -192,15 +192,15 @@ class Optimizer(object):
"""Add optimization operators to update gradients to variables. """Add optimization operators to update gradients to variables.
Args: Args:
loss: the target that this optimization is for. loss(Variable): the target that this optimization is for.
parameters_and_grads: a list of (variable, gradient) pair to update. parameters_and_grads(list(tuple(Variable, Variable))):
a list of (variable, gradient) pair to update.
Returns: Returns:
return_op_list: a list of operators that will complete one step of return_op_list: a list of operators that will complete one step of
optimization. This will include parameter update ops, global step optimization. This will include parameter update ops, global step
update ops and any other custom ops required by subclasses to manage update ops and any other custom ops required by subclasses to manage
their internal state. their internal state.
:param startup_program:
""" """
# This is a default implementation of create_optimization_pass that # This is a default implementation of create_optimization_pass that
# can be shared by most optimizers. This implementation assumes that # can be shared by most optimizers. This implementation assumes that
...@@ -268,7 +268,22 @@ class Optimizer(object): ...@@ -268,7 +268,22 @@ class Optimizer(object):
class SGDOptimizer(Optimizer): class SGDOptimizer(Optimizer):
""" Simple SGD optimizer without any state. """
Optimizer of the stochastic gradient descent algorithm.
.. math::
param\_out = param - learning\_rate * grad
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
Examples:
.. code-block:: python
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2)
sgd_optimizer.minimize(cost)
""" """
def __init__(self, learning_rate, **kwargs): def __init__(self, learning_rate, **kwargs):
...@@ -294,7 +309,37 @@ class SGDOptimizer(Optimizer): ...@@ -294,7 +309,37 @@ class SGDOptimizer(Optimizer):
class MomentumOptimizer(Optimizer): class MomentumOptimizer(Optimizer):
"""Simple Momentum optimizer with velocity state """
Simple Momentum optimizer with velocity state
This optimizer has a flag for Nestrov Momentum.
The update equations are as follows:
.. math::
& velocity = mu * velocity + gradient
& if (use\_nesterov):
&\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate
& else:
&\quad param = param - learning\_rate * velocity
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
momentum (float): momentum factor
use_nesterov (bool): enables Nesterov momentum
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
optimizer.minimize(cost)
""" """
_velocity_acc_str = "velocity" _velocity_acc_str = "velocity"
...@@ -338,7 +383,32 @@ class MomentumOptimizer(Optimizer): ...@@ -338,7 +383,32 @@ class MomentumOptimizer(Optimizer):
class AdagradOptimizer(Optimizer): class AdagradOptimizer(Optimizer):
"""Simple Adagrad optimizer with moment state """
**Adaptive Gradient Algorithm (Adagrad)**
The update is done as follows:
.. math::
moment\_out &= moment + grad * grad
param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here in our implementation
as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
for numerical stability to avoid the division by zero error.
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
epsilon (float): a small float value for numerical stability.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
optimizer.minimize(cost)
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
...@@ -379,7 +449,40 @@ class AdagradOptimizer(Optimizer): ...@@ -379,7 +449,40 @@ class AdagradOptimizer(Optimizer):
class AdamOptimizer(Optimizer): class AdamOptimizer(Optimizer):
"""Implements the Adam Optimizer """
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
.. math::
t & = t + 1
moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
learning\_rate & = learning\_rate * \\
\\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Adam(learning_rate=0.2)
optimizer.minimize(cost)
""" """
_moment1_acc_str = "moment1" _moment1_acc_str = "moment1"
_moment2_acc_str = "moment2" _moment2_acc_str = "moment2"
...@@ -484,7 +587,42 @@ class AdamOptimizer(Optimizer): ...@@ -484,7 +587,42 @@ class AdamOptimizer(Optimizer):
class AdamaxOptimizer(Optimizer): class AdamaxOptimizer(Optimizer):
"""Implements the Adamax Optimizer """
We implement the Adamax optimizer from Section 7 of the Adam
paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.
Adamax updates:
.. math::
t & = t + 1
moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
The original paper does not have an epsilon attribute.
However, it is added here for numerical stability to prevent the
division by 0 error.
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer.minimize(cost)
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm" _inf_norm_acc_str = "inf_norm"
...@@ -568,7 +706,34 @@ class AdamaxOptimizer(Optimizer): ...@@ -568,7 +706,34 @@ class AdamaxOptimizer(Optimizer):
class DecayedAdagradOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer):
"""Simple Decayed Adagrad optimizer with moment state """
**Decayed Adagrad Optimizer**
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
The update is done as follows:
.. math::
moment\_out & = decay * moment + (1 - decay) * grad * grad
param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have an epsilon attribute. It is added here for numerical
stability to avoid the division by zero error.
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
decay (float): decay rate.
epsilon (float): a small float value for numerical stability.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer.minimize(cost)
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
...@@ -614,6 +779,7 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -614,6 +779,7 @@ class DecayedAdagradOptimizer(Optimizer):
class AdadeltaOptimizer(Optimizer): class AdadeltaOptimizer(Optimizer):
""" """
**Adadelta Optimizer** **Adadelta Optimizer**
Simple Adadelta optimizer with average squared grad state and Simple Adadelta optimizer with average squared grad state and
average squared update state. average squared update state.
The details of adadelta please refer to this The details of adadelta please refer to this
...@@ -628,7 +794,7 @@ class AdadeltaOptimizer(Optimizer): ...@@ -628,7 +794,7 @@ class AdadeltaOptimizer(Optimizer):
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2 E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
Args: Args:
learning_rate(float): global leraning rate learning_rate(float): global learning rate
rho(float): rho in equation rho(float): rho in equation
epsilon(float): epsilon in equation epsilon(float): epsilon in equation
...@@ -703,37 +869,37 @@ class RMSPropOptimizer(Optimizer): ...@@ -703,37 +869,37 @@ class RMSPropOptimizer(Optimizer):
.. math:: .. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
The first equation calculates moving average of the squared gradient for The first equation calculates moving average of the squared gradient for
each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`. each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
In some cases, adding a momentum term :math: `\\beta` is beneficial. In some cases, adding a momentum term :math: `\\beta` is beneficial.
In our implementation, Nesterov momentum is used: In our implementation, Nesterov momentum is used:
.. math:: .. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
\\epsilon}} \\nabla Q_{i}(w) \\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) w & = w - v(w, t)
where, :math: `\\rho` is a hyperparameter and typical values are 0.9, 0.95 where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
smoothing term to avoid division by zero, usually set somewhere in range smoothing term to avoid division by zero, usually set somewhere in range
from 1e-4 to 1e-8. from 1e-4 to 1e-8.
Args: Args:
learning_rate(float): global leraning rate. learning_rate(float): global learning rate.
rho(float): rho is :math: `\\rho` in equation, set 0.95 by default. rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
epsilon(float): :math: `\\epsilon` in equation is smoothing term to epsilon(float): :math: `\\epsilon` in equation is smoothing term to
avoid division by zero, set 1e-6 by default. avoid division by zero, set 1e-6 by default.
momentum(float): :math: `\\beta` in equation is the momentum term, momentum(float): :math:`\\beta` in equation is the momentum term,
set 0.0 by default. set 0.0 by default.
Raises: Raises:
...@@ -810,6 +976,113 @@ class RMSPropOptimizer(Optimizer): ...@@ -810,6 +976,113 @@ class RMSPropOptimizer(Optimizer):
return rmsprop_op return rmsprop_op
class FtrlOptimizer(Optimizer):
"""
FTRL (Follow The Regularized Leader) Optimizer.
The paper that proposed Follow The Regularized Leader (FTRL):
(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
.. math::
&new\_accum = squared\_accum + grad^2
&if (lr\_power == -0.5):
&\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
&else:
&\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
&x = l1 * sign(linear\_accum) - linear\_accum
&if (lr\_power == -0.5):
&\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
&\quad pre\_shrink = \\frac{x}{y}
&\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
&else:
&\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
&\quad pre\_shrink = \\frac{x}{y}
&\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
&squared\_accum += grad^2
Args:
learning_rate (float|Variable): global learning rate.
l1 (float):
l2 (float):
lr_power (float):
Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Ftrl(0.0001)
_, params_grads = optimizer.minimize(cost)
"""
_squared_acc_str = "squared"
_linear_acc_str = "linear"
def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
super(FtrlOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
if learning_rate is None:
raise ValueError("learning_rate is not set.")
self.type = "ftrl"
self._l1 = l1
self._l2 = l2
self._lr_power = lr_power
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
for p in parameters:
self._add_accumulator(self._squared_acc_str, p)
self._add_accumulator(self._linear_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
squared_acc = self._get_accumulator(self._squared_acc_str,
param_and_grad[0])
linear_acc = self._get_accumulator(self._linear_acc_str,
param_and_grad[0])
ftrl_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"SquaredAccumulator": squared_acc,
"LinearAccumulator": linear_acc,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={
"ParamOut": param_and_grad[0],
"SquaredAccumOut": squared_acc,
"LinearAccumOut": linear_acc
},
attrs={"l1": self._l1,
"l2": self._l1,
"lr_power": self._lr_power})
return ftrl_op
# We short the class name, since users will use the optimizer with the package # We short the class name, since users will use the optimizer with the package
# name. The sample code: # name. The sample code:
# #
...@@ -826,6 +1099,7 @@ Adamax = AdamaxOptimizer ...@@ -826,6 +1099,7 @@ Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer RMSProp = RMSPropOptimizer
Ftrl = FtrlOptimizer
class ModelAverage(Optimizer): class ModelAverage(Optimizer):
...@@ -844,7 +1118,9 @@ class ModelAverage(Optimizer): ...@@ -844,7 +1118,9 @@ class ModelAverage(Optimizer):
max_average_window: The maximum size of average window. max_average_window: The maximum size of average window.
Examples: Examples:
...
.. code-block:: python
optimizer = fluid.optimizer.Momentum() optimizer = fluid.optimizer.Momentum()
_, params_grads = optimizer.minimize(cost) _, params_grads = optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
......
...@@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None):
counters/options for profiling by `config` argument. The default config counters/options for profiling by `config` argument. The default config
is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
Then users can use NVIDIA Visual Profiler
(https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
this output file to visualize results.
Args: Args:
output_file (string) : The output file name, the result will be output_file (string) : The output file name, the result will be
...@@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None):
Comma separated values format. It should be 'kvp' or 'csv'. Comma separated values format. It should be 'kvp' or 'csv'.
config (list of string) : The profiler options and counters can refer config (list of string) : The profiler options and counters can refer
to "Compute Command Line Profiler User Guide". to "Compute Command Line Profiler User Guide".
Raises:
ValueError: If `output_mode` is not in ['kvp', 'csv'].
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
epoc = 8
dshape = [4, 3, 28, 28]
data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output_file = 'cuda_profiler.txt'
with profiler.cuda_profiler(output_file, 'csv') as nvprof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
# then use NVIDIA Visual Profiler (nvvp) to load this output file
# to visualize results.
""" """
if output_mode is None: if output_mode is None:
output_mode = 'csv' output_mode = 'csv'
...@@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None):
def reset_profiler(): def reset_profiler():
"""The profiler clear interface. """
reset_profiler will clear the previous time record. Clear the previous time record. This interface does not work for
`fluid.profiler.cuda_profiler`, it only works for
`fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
and `fluid.profiler.profiler`.
Examples:
.. code-block:: python
import paddle.fluid.profiler as profiler
with profiler.profiler(state, 'total', '/tmp/profile'):
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
# ...
""" """
core.reset_profiler() core.reset_profiler()
def start_profiler(state): def start_profiler(state):
"""Enable the profiler. """
Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to insert the code, except the usage of
`fluid.profiler.profiler` interface.
Args: Args:
state (string) : The profiling state, which should be 'CPU', 'GPU' state (string) : The profiling state, which should be 'CPU', 'GPU'
or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
GPU as well. 'All' also generates timeline. GPU as well. 'All' also generates timeline.
Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
Examples:
.. code-block:: python
import paddle.fluid.profiler as profiler
profiler.start_profiler('GPU')
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
# except each iteration
profiler.stop_profiler('total', '/tmp/profile')
""" """
if core.is_profiler_enabled(): if core.is_profiler_enabled():
return return
...@@ -97,7 +160,10 @@ def start_profiler(state): ...@@ -97,7 +160,10 @@ def start_profiler(state):
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
"""Stop the profiler. """
Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to insert the code, except the usage of
`fluid.profiler.profiler` interface.
Args: Args:
sorted_key (string) : If None, the profiling results will be printed sorted_key (string) : If None, the profiling results will be printed
...@@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile profile_path (string) : If state == 'All', it will write a profile
proto output file. proto output file.
Raises:
ValueError: If `sorted_key` is not in
['calls', 'total', 'max', 'min', 'ave'].
Examples:
.. code-block:: python
import paddle.fluid.profiler as profiler
profiler.start_profiler('GPU')
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
# except each iteration
profiler.stop_profiler('total', '/tmp/profile')
""" """
if not core.is_profiler_enabled(): if not core.is_profiler_enabled():
return return
...@@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
Different from cuda_profiler, this profiler can be used to profile both CPU Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels, and GPU program. By defalut, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial if you want to profile other program, you can refer the profiling tutorial
to add more records. to add more records in C++ code.
If the state == 'All', a profile proto file will be written to
`profile_path`. This file records timeline information during the execution.
Then users can visualize this file to see the timeline, please refer
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
Args: Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU', state (string) : The profiling state, which should be 'CPU' or 'GPU',
...@@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile profile_path (string) : If state == 'All', it will write a profile
proto output file. proto output file.
Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
not in ['calls', 'total', 'max', 'min', 'ave'].
Examples:
.. code-block:: python
import paddle.fluid.profiler as profiler
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
# ...
""" """
start_profiler(state) start_profiler(state)
yield yield
......
...@@ -16,8 +16,8 @@ import framework ...@@ -16,8 +16,8 @@ import framework
from . import core from . import core
__all__ = [ __all__ = [
'append_regularization_ops', 'WeightDecayRegularizer', 'L1Decay', 'L2Decay', 'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer',
'L1DecayRegularizer', 'L2DecayRegularizer' 'L2DecayRegularizer'
] ]
...@@ -36,7 +36,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None): ...@@ -36,7 +36,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
set. It will be applied with regularizer. set. It will be applied with regularizer.
Returns: Returns:
list of (parameters, gradients) pair with the regularized gradient list[(Variable, Variable)]: list of (parameters, gradients) \
pair with the regularized gradient
Raises: Raises:
Exception: Unknown regularization type Exception: Unknown regularization type
...@@ -100,6 +101,24 @@ class WeightDecayRegularizer(object): ...@@ -100,6 +101,24 @@ class WeightDecayRegularizer(object):
class L2DecayRegularizer(WeightDecayRegularizer): class L2DecayRegularizer(WeightDecayRegularizer):
"""Implements the L2 Weight Decay Regularization """Implements the L2 Weight Decay Regularization
Small values of L2 can help prevent over fitting the training data.
.. math::
L2WeightDecay = reg\_coeff * parameter
Args:
regularization_coeff(float): regularization coeff
Examples:
.. code-block:: python
optimizer = fluid.optimizer.Adagrad(
learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1))
optimizer.minimize(avg_cost)
""" """
def __init__(self, regularization_coeff=0.0): def __init__(self, regularization_coeff=0.0):
...@@ -154,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer): ...@@ -154,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer):
class L1DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer):
"""Implements the L1 Weight Decay Regularization """Implements the L1 Weight Decay Regularization
L1 regularization encourages sparsity.
.. math::
L1WeightDecay = reg\_coeff * sign(parameter)
Args:
regularization_coeff(float): regularization coeff
Examples:
.. code-block:: python
program = fluid.framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
""" """
def __init__(self, regularization_coeff=0.0): def __init__(self, regularization_coeff=0.0):
......
...@@ -194,16 +194,16 @@ def train(word_dict, ...@@ -194,16 +194,16 @@ def train(word_dict,
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local): ...@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True):
if is_local: if is_local:
train_loop(framework.default_main_program()) train_loop(framework.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -151,16 +151,16 @@ def train(nn_type, ...@@ -151,16 +151,16 @@ def train(nn_type,
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True): ...@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -43,7 +43,7 @@ class TestConcatOp(OpTest): ...@@ -43,7 +43,7 @@ class TestConcatOp(OpTest):
self.axis = 1 self.axis = 1
class TestConcatOp2(OpTest): class TestConcatOp2(TestConcatOp):
def init_test_data(self): def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
...@@ -51,5 +51,16 @@ class TestConcatOp2(OpTest): ...@@ -51,5 +51,16 @@ class TestConcatOp2(OpTest):
self.axis = 1 self.axis = 1
class TestConcatOp3(TestConcatOp):
def init_test_data(self):
self.x0 = np.random.random((1, 256, 170, 256)).astype('float32')
self.x1 = np.random.random((1, 128, 170, 256)).astype('float32')
self.x2 = np.random.random((1, 128, 170, 256)).astype('float32')
self.axis = 1
def test_check_grad(self):
pass
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
#
if(APPLE) import unittest
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) from test_gaussian_random_op import TestGaussianRandomOp
class TestMKLDNN(TestGaussianRandomOp):
def init_kernel_type(self):
self.use_mkldnn = True
cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context framework_proto proto_desc operator)
cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
cc_test(test_tape if __name__ == '__main__':
SRCS test_tape.cc unittest.main()
DEPS tape tape_variable)
...@@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase): ...@@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.op_type = "gaussian_random" self.op_type = "gaussian_random"
self.inputs = {} self.inputs = {}
self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} self.use_mkldnn = False
self.init_kernel_type()
self.attrs = {
"shape": [1000, 784],
"mean": .0,
"std": 1.,
"seed": 10,
"use_mkldnn": self.use_mkldnn
}
self.outputs = ["Out"] self.outputs = ["Out"]
...@@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase): ...@@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase):
self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
def init_kernel_type(self):
pass
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -401,6 +401,15 @@ class TestBook(unittest.TestCase): ...@@ -401,6 +401,15 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
print(str(program)) print(str(program))
def test_maxout(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 5], dtype="float32")
y = layers.data(name='y', shape=[2, 3], dtype="float32")
output = layers.crop(x, shape=y)
self.assertIsNotNone(output)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): ...@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
class TestFtrlOptimizer(unittest.TestCase):
class MockFtrl(optimizer.FtrlOptimizer):
def get_accumulators(self):
return self._accumulators
def get_squared_str(self):
return self._squared_acc_str
def get_linear_str(self):
return self._linear_acc_str
def test_ftrl_optimizer(self):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
learning_rate = 0.01
ftrl_optimizer = self.MockFtrl(
learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
init_program)
self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "ftrl"])
# Check accumulators
accumulators = ftrl_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 2)
self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
self.assertEqual(len(squared_acc), 1)
self.assertEqual(len(linear_acc), 1)
self.assertTrue(mul_x.name in squared_acc)
self.assertTrue(mul_x.name in linear_acc)
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 3)
self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test_sum_op import TestSumOp
class TestMKLDNN(TestSumOp):
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__':
unittest.main()
...@@ -20,12 +20,15 @@ from op_test import OpTest ...@@ -20,12 +20,15 @@ from op_test import OpTest
class TestSumOp(OpTest): class TestSumOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sum" self.op_type = "sum"
self.use_mkldnn = False
self.init_kernel_type()
x0 = np.random.random((3, 4)).astype('float32') x0 = np.random.random((3, 4)).astype('float32')
x1 = np.random.random((3, 4)).astype('float32') x1 = np.random.random((3, 4)).astype('float32')
x2 = np.random.random((3, 4)).astype('float32') x2 = np.random.random((3, 4)).astype('float32')
self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
y = x0 + x1 + x2 y = x0 + x1 + x2
self.outputs = {'Out': y} self.outputs = {'Out': y}
self.attrs = {'use_mkldnn': self.use_mkldnn}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -33,6 +36,9 @@ class TestSumOp(OpTest): ...@@ -33,6 +36,9 @@ class TestSumOp(OpTest):
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['x0'], 'Out') self.check_grad(['x0'], 'Out')
def init_kernel_type(self):
pass
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -24,7 +24,7 @@ Steps to transpile trainer: ...@@ -24,7 +24,7 @@ Steps to transpile trainer:
1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
3. modify trainer program add split_op to each grad variable. 3. modify trainer program add split_op to each grad variable.
4. append send_op to send splited variables to server and 4. append send_op to send splited variables to server and
5. add recv_op to fetch params(splited blocks or origin param) from server. 5. add recv_op to fetch params(splited blocks or origin param) from server.
6. append concat_op to merge splited blocks to update local weights. 6. append concat_op to merge splited blocks to update local weights.
...@@ -44,7 +44,7 @@ import numpy as np ...@@ -44,7 +44,7 @@ import numpy as np
from ps_dispatcher import RoundRobin, HashName, PSDispatcher from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, \ default_startup_program, Block, \
Variable, Parameter, grad_var_name Variable, Parameter, grad_var_name
from details import * from details import *
...@@ -471,7 +471,7 @@ class DistributeTranspiler: ...@@ -471,7 +471,7 @@ class DistributeTranspiler:
self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
self.origin_program, merged_var) self.origin_program, merged_var)
else: else:
self._append_pserver_non_opt_ops(block, op, endpoint) self._append_pserver_non_opt_ops(block, op)
def __op_have_grad_input__(op): def __op_have_grad_input__(op):
for varname in op.input_arg_names: for varname in op.input_arg_names:
...@@ -479,13 +479,39 @@ class DistributeTranspiler: ...@@ -479,13 +479,39 @@ class DistributeTranspiler:
return varname return varname
return "" return ""
def __clone_lr_op_sub_block__(op, program, new_block):
if not op.has_attr('sub_block'):
return
origin_block_desc = op.attr('sub_block')
origin_block = self.origin_program.block(origin_block_desc.id)
assert isinstance(origin_block, Block)
# we put the new sub block to new block to follow the block
# hierarchy of the original blocks
new_sub_block = program.create_block(new_block.idx)
# clone vars
for var in origin_block.vars:
new_sub_block.clone_variable(var)
# clone ops
for op in origin_block.ops:
self._clone_lr_op(program, new_sub_block, op)
# clone sub_block of op
__clone_lr_op_sub_block__(op, program, new_sub_block)
# reset the block of op
op.set_attr('sub_block', new_sub_block)
# append lr decay ops to the child block if exists # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
if len(lr_ops) > 0: if len(lr_ops) > 0:
lr_decay_block = pserver_program.create_block( lr_decay_block = pserver_program.create_block(
pserver_program.num_blocks - 1) pserver_program.num_blocks - 1)
for _, op in enumerate(lr_ops): for _, op in enumerate(lr_ops):
self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint) self._append_pserver_non_opt_ops(lr_decay_block, op)
# append sub blocks to pserver_program in lr_decay_op
__clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)
# append op to the current block # append op to the current block
grad_to_block_id = [] grad_to_block_id = []
...@@ -801,7 +827,8 @@ class DistributeTranspiler: ...@@ -801,7 +827,8 @@ class DistributeTranspiler:
table_opt_block.append_op( table_opt_block.append_op(
type="sum", type="sum",
inputs={"X": pserver_side_table_grad_list}, inputs={"X": pserver_side_table_grad_list},
outputs={"Out": [grad_var]}) outputs={"Out": [grad_var]},
attrs={"use_mkldnn": False})
else: else:
# in async_mode, for table gradient, it also need to be splited to each parameter server # in async_mode, for table gradient, it also need to be splited to each parameter server
origin_grad_name = grad_var.name origin_grad_name = grad_var.name
...@@ -1052,7 +1079,8 @@ class DistributeTranspiler: ...@@ -1052,7 +1079,8 @@ class DistributeTranspiler:
optimize_block.append_op( optimize_block.append_op(
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var},
attrs={"use_mkldnn": False})
# TODO(panyx0718): What if it's SELECTED_ROWS. # TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op( optimize_block.append_op(
...@@ -1138,7 +1166,29 @@ class DistributeTranspiler: ...@@ -1138,7 +1166,29 @@ class DistributeTranspiler:
break break
return grad_block return grad_block
def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint): def _clone_lr_op(self, program, block, op):
inputs = self._get_input_map_from_op(
self.origin_program.global_block().vars, op)
for key, varlist in inputs.iteritems():
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
if var not in program.global_block().vars:
block.clone_variable(var)
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, op)
for key, varlist in outputs.iteritems():
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
if var not in program.global_block().vars:
block.clone_variable(var)
block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program program = optimize_block.program
# Append the ops for parameters that do not need to be optimized/updated # Append the ops for parameters that do not need to be optimized/updated
inputs = self._get_input_map_from_op( inputs = self._get_input_map_from_op(
......
...@@ -19,16 +19,30 @@ from ..executor import global_scope ...@@ -19,16 +19,30 @@ from ..executor import global_scope
class InferenceTranspiler: class InferenceTranspiler:
'''
Convert the fluid program to optimized inference program.
There are several optimizations, only fuse batch normalization is supported now.
Examples:
.. code-block:: python
# As InferenceTranspiler will modify the original program,
# please clone before use it.
inference_transpiler_program = program.clone()
t = fluid.InferenceTranspiler()
t.transpile(inference_transpiler_program, place)
'''
def transpile(self, program, place, scope=None): def transpile(self, program, place, scope=None):
''' '''
Transpile the program. Support only fuse batch normalization now. Run the transpiler.
:param program: program to transpile Args:
:type program: Program program (Program): program to transpile
:param place: inference place place (Place): inference place
:type place: Place scope (Scope|None): inference Scope
:param scope: inference scope
:type scope: Scope or None
''' '''
if not isinstance(program, Program): if not isinstance(program, Program):
raise TypeError("program should be as Program type") raise TypeError("program should be as Program type")
...@@ -49,36 +63,43 @@ class InferenceTranspiler: ...@@ -49,36 +63,43 @@ class InferenceTranspiler:
can be integrated with them. Doing so will give us a forward acceleration, can be integrated with them. Doing so will give us a forward acceleration,
especially in environments like mobile or embedded. especially in environments like mobile or embedded.
For input X: For input :math:`X`:
- Conv process: X = input * W + bias
- Batch norm process: X' = (X - mean) / std - Conv process: :math:`X = input * W + bias`
- Scale Process: Y = a * X' + b - Batch norm process: :math:`X' = (X - mean) / std`
- Scale Process: :math:`Y = a * X' + b`
After fuse into one operation: After fuse into one operation:
Y = (input * W + bias - mean) / std * a + b .. math::
= input * a * W / std + ((bias - mean) / std * a + b)
Y &= (input * W + bias - mean) / std * a + b \\\\
&= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is: The operator transformation is:
- before: - before:
- conv->batch_norm->any_other_op (bias == 0) - conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after: - after:
- conv->elementwise_add->any_other_op - conv->elementwise_add->any_other_op
The transpile stages are: The transpile stages are:
1. insert elementwise_add op when bias == 0. 1. insert elementwise_add op when bias == 0.
2. fuse the batch_norm's parameters to conv and elementwise_add operators. 2. fuse the batch_norm's parameters to conv and elementwise_add operators.
3. remove batch_norm ops which are not used in any other ops. 3. remove batch_norm ops which are not used in any other ops.
4. adjust the input of any_other_op to be the output of elementwise_add operator. 4. adjust the input of any_other_op to be the output of elementwise_add operator.
5. remove unused variables. 5. remove unused variables.
:param program: program to transpile Args:
:type program: Program program (Program): program to transpile
:param place: inference place place (Place): inference place
:type place: Place scope (Scope): inference Scope
:param scope: inference scope
:type scope: Scope
''' '''
self.scope = scope self.scope = scope
self.place = place self.place = place
......
...@@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"): ...@@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"):
class PipeReader: class PipeReader:
""" """
PipeReader read data by stream from a command, take it's PipeReader read data by stream from a command, take it's
stdout into a pipe buffer and redirect it to the parser to stdout into a pipe buffer and redirect it to the parser to
parse, then yield data as your desired format. parse, then yield data as your desired format.
...@@ -352,7 +352,7 @@ class PipeReader: ...@@ -352,7 +352,7 @@ class PipeReader:
An example: An example:
.. code-block:: python .. code-block:: python
def example_reader(): def example_reader():
for f in myfiles: for f in myfiles:
pr = PipeReader("cat %s"%f) pr = PipeReader("cat %s"%f)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Print all signature of a python module in alphabet order.
Usage:
./print_signature "paddle.fluid" > signature.txt
"""
import importlib
import inspect
import collections
import sys
import pydoc
member_dict = collections.OrderedDict()
def visit_member(parent_name, member):
cur_name = ".".join([parent_name, member.__name__])
if inspect.isclass(member):
for name, value in inspect.getmembers(member):
if hasattr(value, '__name__') and (not name.startswith("_") or
name == "__init__"):
visit_member(cur_name, value)
elif callable(member):
try:
member_dict[cur_name] = inspect.getargspec(member)
except TypeError: # special for PyBind method
member_dict[cur_name] = " ".join([
line.strip() for line in pydoc.render_doc(member).split('\n')
if "->" in line
])
else:
raise RuntimeError("Unsupported generate signature of member, type {0}".
format(str(type(member))))
def visit_all_module(mod):
for member_name in (
name
for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
if not name.startswith("_")):
instance = getattr(mod, member_name, None)
if instance is None:
continue
if inspect.ismodule(instance):
visit_all_module(instance)
else:
visit_member(mod.__name__, instance)
visit_all_module(importlib.import_module(sys.argv[1]))
for name in member_dict:
print name, member_dict[name]
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册