提交 adc7ba2e 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/jit

......@@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.1.0.post87
pip install paddlepaddle-gpu==1.2.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.1.0.post85
pip install paddlepaddle-gpu==1.2.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
......@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
We appreciate your contributions!
......
......@@ -66,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
......
add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(imperative)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(recordio)
......
......@@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) {
if (name == use_slots_[i]) {
if (use_slots_is_dense_[i]) {
feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
} else {
feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
}
feed_vec_[i] = var->GetMutable<LoDTensor>();
}
}
}
......@@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s",
str);
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float
......@@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec(
(*ins_vec)[i].InitOffset();
}
}
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].AddIns(instance[i]);
}
......@@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec(
const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back());
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else {
float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
} else {
int64_t* tensor_ptr =
feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
}
LoD data_lod{offset};
feed_vec_[i]->set_lod(data_lod);
if (use_slots_is_dense_[i]) {
int dim = total_instance / batch_size_;
feed_vec_[i]->Resize({batch_size_, dim});
}
}
}
......
......@@ -30,35 +30,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
// Pack Tensor type and LoDTensor type into MixTensor type, in order
// to record either Tensor or LoDTensor information at the same time.
class MixTensor {
public:
MixTensor() {}
explicit MixTensor(LoDTensor* lodtensor) {
is_dense_ = false;
lodtensor_ = lodtensor;
}
explicit MixTensor(Tensor* tensor) {
is_dense_ = true;
tensor_ = tensor;
}
bool IsDense() { return is_dense_; }
LoDTensor* GetLoDTensor() {
PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
return lodtensor_;
}
Tensor* GetTensor() {
PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
return tensor_;
}
private:
bool is_dense_;
LoDTensor* lodtensor_;
Tensor* tensor_;
};
// DataFeed is the base virtual class for all ohther DataFeeds.
// It is used to read files and parse the data for subsequent trainer.
// Example:
......@@ -133,7 +104,7 @@ class DataFeed {
use_slots_index_; // -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here
std::vector<MixTensor> feed_vec_;
std::vector<LoDTensor*> feed_vec_;
// the batch size defined by user
int default_batch_size_;
......
......@@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
std::map<std::string, const paddle::framework::LoDTensor*>
lodtensor_targets;
std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
const auto& slot = multi_slot_desc.slots(i);
if (slot.is_used()) {
const auto& name = slot.name();
readers[idx]->AddFeedVar(scope->Var(name), name);
if (slot.is_dense()) {
tensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::Tensor>();
} else {
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
}
readers[idx]->Start();
......@@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
if (!slot.is_used()) {
continue;
}
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.is_dense()) { // dense branch
const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
int batch_size = tens->dims()[0];
......@@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
PADDLE_THROW("Error type in proto file.");
}
} else { // sparse branch
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
......
......@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
void* in_data = GetDataFromTensor(in, in_type);
// output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims());
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
if (in_format != out_format) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
platform::Reorder(in_memory, out_memory);
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
......
......@@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
if(WITH_GPU)
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
else()
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
endif()
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_grpc)
else()
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor)
endif()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
endif()
......
......@@ -58,6 +58,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
}
CollectiveContext *context = CollectiveContext::GetInstance();
context->endpoints_ = strategy_.trainers_endpoints_;
context->trainer_id_ = strategy_.trainer_id_;
PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
if (strategy_.trainer_id_ > 0) {
PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
strategy_.trainers_endpoints_.size(),
"trainer_id_ < endpoints_ size");
}
VLOG(1) << "CollectiveContext:" << context->String();
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......@@ -135,16 +146,16 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "sequential_execution_pass") {
VLOG(1) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
......
......@@ -74,6 +74,8 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
int num_trainers_{1};
int trainer_id_{0};
std::vector<std::string> trainers_endpoints_;
bool remove_unnecessary_lock_{false};
// NOTE:
......
......@@ -53,7 +53,7 @@ struct ReduceLoDTensor {
}
};
inline void GatherSelectedRows(
inline void GatherLocalSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
......
......@@ -16,6 +16,12 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#endif
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(
......@@ -26,6 +32,112 @@ namespace paddle {
namespace framework {
namespace details {
std::once_flag CollectiveContext::init_flag_;
std::unique_ptr<CollectiveContext> CollectiveContext::context_;
static inline std::string GetRemoteVarName(const std::string &var_name,
int trainer_id) {
return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
}
void ReduceOpHandle::Wait(
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
// TODO(gongwb): use event wait?
for (auto &dev_ctx : dev_ctxes) {
dev_ctx.second->Wait();
}
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void ReduceOpHandle::GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selected_rows,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
VarHandle *out_var_handle, const platform::Place &out_place,
SelectedRows *dst_selected_rows) {
const CollectiveContext &collective_context =
*CollectiveContext::GetInstance();
// 1. gather local selected rows, merge them
std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
auto scope = local_scopes_.at(out_var_handle->scope_idx_);
auto gathered_var_mid = scope->Var(gathered_var_name);
auto gathered_select_rows =
gathered_var_mid->GetMutable<framework::SelectedRows>();
GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
gathered_select_rows);
// FIXME(gongwb): remove this Wait.
Wait(dev_ctxes);
// merge them
auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
std::string merged_var_name =
GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
auto merged_select_rows =
scope->Var(merged_var_name)->GetMutable<SelectedRows>();
operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
// 2. start collective server if it doesn't exist
operators::distributed::CollectiveServer *server =
operators::distributed::CollectiveServer::GetInstance(
collective_context.endpoints_[collective_context.trainer_id_],
collective_context.endpoints_.size() - 1);
auto rpc_server = server->GetRPCServer();
rpc_server->RegisterVar(merged_var_name,
operators::distributed::kRequestGetMonomerVariable,
scope, merged_dev_ctx);
// 3. gather them from all remote nodes.
std::vector<const SelectedRows *> remote;
operators::distributed::CollectiveClient *client =
operators::distributed::CollectiveClient::GetInstance();
std::vector<operators::distributed::RemoteVar> vars;
for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
if (i == (unsigned)collective_context.trainer_id_) continue;
operators::distributed::RemoteVar var;
var.trainer_id_ = i;
var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
var.ep_ = collective_context.endpoints_[i];
vars.push_back(var);
VLOG(4) << "gather from:" << var.String();
}
// erase gathered vars
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
PADDLE_ENFORCE(remote.size() == vars.size());
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
all.resize(collective_context.endpoints_.size());
for (auto v : vars) {
all[v.trainer_id_] =
scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
}
all[collective_context.trainer_id_] = merged_select_rows;
merge_func(*merged_dev_ctx, all, dst_selected_rows);
rpc_server->WaitVarBarrier(merged_var_name);
rpc_server->ClearVar(merged_var_name);
// 5. clear mid vars
std::vector<std::string> tmp_vars{merged_var_name};
for (auto r : vars) {
tmp_vars.push_back(r.var_name_);
}
scope->EraseVars(tmp_vars);
}
#endif
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
......@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
this->RunAndRecordEvent([&] {
std::vector<const SelectedRows *> in_selected_rows =
GetInputValues<SelectedRows>(in_var_handles, var_scopes);
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
const CollectiveContext &collective_context =
*CollectiveContext::GetInstance();
VLOG(10) << "GatherSelectedRows CollectiveContext:"
<< collective_context.String();
// TODO(gongwb): add cpu support
if (collective_context.endpoints_.size() <= 1 ||
is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
t_out_p,
out_var->GetMutable<framework::SelectedRows>());
return;
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
GatherSelectedRows<platform::CUDADeviceContext, float>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else if (framework::IsType<const double>(
in_selected_rows[0]->value().type())) {
GatherSelectedRows<platform::CUDADeviceContext, double>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_ENFORCE(false,
"only support double or float when gahter SelectedRows");
}
#endif
});
} else {
std::vector<const LoDTensor *> lod_tensors =
......
......@@ -30,6 +30,32 @@
namespace paddle {
namespace framework {
namespace details {
struct CollectiveContext {
std::vector<std::string> endpoints_;
int trainer_id_{0};
std::string String() const {
std::stringstream ss;
ss << "endpoints_:";
for (auto e : endpoints_) {
ss << e << ",";
}
ss << "trainer_id_:" << trainer_id_;
return ss.str();
}
static CollectiveContext *GetInstance() {
std::call_once(init_flag_,
[&]() { context_.reset(new CollectiveContext()); });
return context_.get();
}
private:
static std::once_flag init_flag_;
static std::unique_ptr<CollectiveContext> context_;
};
struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
......@@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase {
protected:
void RunImpl() override;
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
VarHandle *out_var_handle, const platform::Place &out_place,
SelectedRows *dst_selecte_rows);
#endif
void Wait(
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
template <typename T>
std::vector<const T *> GetInputValues(
const std::vector<VarHandle *> &in_var_handles,
......
......@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() {
static unsigned concurrency_cap = std::thread::hardware_concurrency();
int thread_id = this->thread_id_;
if (thread_id < concurrency_cap) {
if (static_cast<unsigned>(thread_id) < concurrency_cap) {
unsigned proc = thread_id;
cpu_set_t mask;
......
......@@ -16,7 +16,9 @@ limitations under the License. */
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
......@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
return tensor;
}
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
Variable* var = scope.FindVar(var_name);
PADDLE_ENFORCE(var, "%s no in scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
return *var->GetMutable<LoDTensor>();
}
} // namespace framework
} // namespace paddle
......@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
} // namespace framework
} // namespace paddle
......@@ -46,14 +46,16 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
std::string type = is_conv3d() ? "conv3d" : "conv2d";
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
->assert_is_op_input(type, "Input");
patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
conv_bias_pattern(conv_input);
conv_bias_pattern(conv_input, is_conv3d());
int found_conv_bias_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
......@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
desc.SetType("conv2d");
desc.SetType(type);
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
......@@ -135,3 +137,5 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
} // namespace paddle
REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
paddle::framework::ir::ConvBiasFusePass);
REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
paddle::framework::ir::Conv3DBiasFusePass);
......@@ -26,11 +26,19 @@ namespace ir {
class ConvBiasFusePass : public FusePassBase {
public:
virtual ~ConvBiasFusePass() {}
virtual bool is_conv3d() const { return false; }
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_bias_mkldnn_fuse"};
};
/*
* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
*/
class Conv3DBiasFusePass : public ConvBiasFusePass {
public:
bool is_conv3d() const override { return true; }
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
switch (role_id) {
case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR)
<< "Cannot add backward operator before forward operator %s."
<< op->Type();
LOG(ERROR) << "Cannot add backward operator before forward operator "
<< op->Type();
}
break;
case _INT(OpRole::kBackward):
......
......@@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
}
PDNode *patterns::ConvBias::operator()(
paddle::framework::ir::PDNode *conv_input) {
paddle::framework::ir::PDNode *conv_input, bool is_conv3d) {
std::string type = is_conv3d ? "conv3d" : "conv2d";
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
conv_input->assert_is_op_input(type, "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type);
auto *eltiwse_op =
pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
// Create variables
......@@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()(
auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
->assert_is_op_input(type, "Filter");
// intermediate variable, will be removed in the IR after fuse.
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d")
->assert_is_only_output_of_op(type)
->assert_is_op_input("elementwise_add");
// Bias stored in elementwise_add
auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
......
......@@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
struct ConvBias : public PatternBase {
ConvBias(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_bias") {}
PDNode* operator()(PDNode* conv_input);
PDNode* operator()(PDNode* conv_input, bool is_conv3d = false);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(eltwise);
......
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (n->RuntimeHasAttr("is_test")) {
if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
......
......@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
#include <string>
namespace paddle {
namespace framework {
......@@ -21,9 +22,19 @@ namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
const auto& op_types_list =
Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
if (op_types_list.empty()) {
op->SetAttr("use_mkldnn", true);
} else if (std::find(op_types_list.begin(), op_types_list.end(),
n->Name()) != op_types_list.end()) {
op->SetAttr("use_mkldnn", true);
}
}
}
}
return graph;
......@@ -33,5 +44,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
} // namespace framework
} // namespace paddle
REGISTER_PASS(mkldnn_placement_pass,
paddle::framework::ir::MKLDNNPlacementPass);
REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
.RequirePassAttr("mkldnn_enabled_op_types");
......@@ -30,28 +30,6 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
return std::unique_ptr<Node>(new Node(name, type));
}
bool Node::RuntimeHasAttr(const std::string &name) const {
if (Op()->HasAttr(name)) {
return true;
} else {
auto &op_info = OpInfoMap::Instance();
auto op_type = Op()->Type();
if (op_info.Has(op_type)) {
auto op_info_ptr = op_info.Get(op_type);
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
}
return false;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -108,18 +108,6 @@ class Node {
Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
// RuntimeHasAttr is different with HasAttr now.
// 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
// thus, if stored program_desc_ are old which don't have an attr, a new
// library which adds the attr already will fail on this function.
// Details:
// https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
// 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
// problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool RuntimeHasAttr(const std::string& name) const;
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string &param_name,
this->outputs_[param_name] = args;
}
bool OpDesc::HasProtoAttr(const std::string &name) const {
auto &op_info = OpInfoMap::Instance();
if (op_info.Has(desc_.type())) {
auto op_info_ptr = op_info.Get(desc_.type());
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
return false;
}
proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
......
......@@ -65,6 +65,8 @@ class OpDesc {
return attrs_.find(name) != attrs_.end();
}
bool HasProtoAttr(const std::string &name) const;
proto::AttrType GetAttrType(const std::string &name) const;
std::vector<std::string> AttrNames() const;
......
cc_library(layer SRCS layer.cc DEPS proto_desc operator)
cc_library(tracer SRCS tracer.cc DEPS proto_desc)
cc_library(engine SRCS engine.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/engine.h"
#include <mutex> // NOLINT
#include <vector>
#include "glog/logging.h"
namespace paddle {
namespace imperative {
static std::once_flag init_engine;
static Engine* engine;
class DummyEngine : public Engine {
public:
void Enqueue(Runnable* runnable) override {
queued_runnables_.push_back(runnable);
}
size_t Size() const override { return queued_runnables_.size(); }
void Sync() override {
for (Runnable* l : queued_runnables_) {
LOG(INFO) << "running " << reinterpret_cast<void*>(l);
}
queued_runnables_.clear();
}
private:
std::vector<Runnable*> queued_runnables_;
};
Engine* GetEngine() {
std::call_once(init_engine, []() { engine = new DummyEngine(); });
return engine;
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <cstdint>
namespace paddle {
namespace imperative {
struct Runnable {};
class Engine {
public:
virtual ~Engine() {}
virtual void Enqueue(Runnable* runnable) = 0;
virtual size_t Size() const = 0;
virtual void Sync() = 0;
};
Engine* GetEngine();
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <limits>
#include <map>
#include <random>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace imperative {
using framework::Variable;
void AddTo(Variable* src, Variable* dst) {
framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
dst_tensor->numel(), src_tensor->numel());
float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
const float* src_data = src_tensor->data<float>();
for (size_t i = 0; i < src_tensor->numel(); ++i) {
dst_data[i] += src_data[i];
}
}
class Autograd {
public:
explicit Autograd(framework::Scope* scope) : scope_(scope) {}
void RunBackward(VarBase* var) {
PADDLE_ENFORCE(var->pre_op_->op_desc_);
// TODO(panyx0718): Only create for vars that "require_grad"
(*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
std::deque<OpBase*> ready;
ready.push_back(var->pre_op_);
std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
for (size_t i = 0; i < input_grads.size(); ++i) {
if (!input_grads[i]) continue;
OpBase* pre_op = ready_op->pre_ops_->at(i);
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
}
}
}
}
private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
std::map<OpBase*, int> ret;
std::deque<OpBase*> queue;
queue.push_back(op);
std::unordered_set<OpBase*> visited;
visited.insert(op);
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
for (OpBase* pre_op : *(candidate->pre_ops_)) {
if (!pre_op) continue;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
}
ret[pre_op] += 1;
}
}
return ret;
}
framework::Scope* scope_;
};
framework::Variable* CreateVariable(const std::string& name,
const framework::DDim& dim, float val,
framework::Scope* scope,
bool random_name = true) {
std::string varname = name;
if (random_name) {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<int>::max());
int id = dist6(rng);
varname = string::Sprintf("%s@%d", varname, id);
}
VLOG(3) << "creating var " << varname;
framework::Variable* var = scope->Var(varname);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
std::fill(data, data + tensor->numel(), val);
return var;
}
framework::LoDTensor& VarBase::Grad() {
VLOG(3) << "get var grad " << var_desc_->Name();
return *grads_->GetMutable<framework::LoDTensor>();
}
void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
VLOG(3) << "apply var grad " << var_desc_->Name() << " "
<< grad->Get<framework::LoDTensor>().data<float>()[0];
if (!grads_) {
grads_ =
CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
}
AddTo(grad, grads_);
VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
<< grads_->Get<framework::LoDTensor>().data<float>()[0];
}
std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
// grad op inputs can be forward inputs, so not in grad_to_var.
continue;
}
VLOG(3) << "op grad in var " << grad_invar;
block_->FindRecursiveOrCreateVar(grad_invar);
framework::Variable* var = scope->Var(grad_invar);
const std::string& invar = grad_to_var_->at(grad_invar);
for (VarBase* varbase : *output_vars_) {
// Use the accumulated grads_ by sharing the input with grads_.
if (varbase->var_desc_->Name() == invar) {
var->GetMutable<framework::LoDTensor>()->ShareDataWith(
varbase->grads_->Get<framework::LoDTensor>());
break;
}
}
}
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
VLOG(3) << "grad outvar " << outvar;
block_->FindRecursiveOrCreateVar(outvar);
framework::Variable* var = scope->Var(outvar);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block_->FindVar(outvar);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
}
grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
opbase->Run(*scope, platform::CPUPlace());
// `ret` matches exactly with `input_vars_` of forward op.
std::vector<Variable*> ret;
for (size_t i = 0; i < input_vars_->size(); ++i) {
bool found = false;
for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
Variable* var = scope->FindVar(outvar);
VarBase* origin_var = (*input_vars_)[i];
std::string orig_var = grad_to_var_->at(outvar);
PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
origin_var->ApplyGrad(scope, var);
found = true;
ret.push_back(var);
// TODO(panyx0718): There might be another outvar with the same name.
// In that case, it doesn't matter the first one or the second one is
// used.
break;
}
if (!found) {
ret.push_back(nullptr);
}
}
return ret;
}
void VarBase::RunBackward(framework::Scope* scope) {
grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
false);
if (!pre_op_) return;
Autograd(scope).RunBackward(this);
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace imperative {
class OpBase;
class VarBase {
public:
VarBase()
: pre_op_(nullptr),
pre_op_out_idx_(-1),
var_desc_(nullptr),
var_(nullptr),
grads_(nullptr) {}
virtual ~VarBase() {}
void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
void RunBackward(framework::Scope* scope);
framework::LoDTensor& Grad();
OpBase* pre_op_;
int pre_op_out_idx_;
framework::VarDesc* var_desc_;
framework::Variable* var_;
framework::Variable* grads_;
};
class OpBase {
public:
OpBase()
: input_vars_(new std::vector<VarBase*>()),
output_vars_(new std::vector<VarBase*>()),
pre_ops_(new std::vector<OpBase*>()),
pre_ops_out_idx_(new std::vector<int>()),
op_desc_(nullptr),
grad_op_desc_(nullptr) {}
virtual ~OpBase() {
delete input_vars_;
delete output_vars_;
delete pre_ops_;
delete pre_ops_out_idx_;
if (grad_op_desc_) delete grad_op_desc_;
if (grad_to_var_) delete grad_to_var_;
}
std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
std::vector<VarBase*>* input_vars_;
std::vector<VarBase*>* output_vars_;
std::vector<OpBase*>* pre_ops_;
std::vector<int>* pre_ops_out_idx_;
framework::OpDesc* op_desc_;
framework::OpDesc* grad_op_desc_;
std::unordered_map<std::string, std::string>* grad_to_var_;
framework::BlockDesc* block_;
};
class Layer {
public:
virtual ~Layer() {}
virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
std::vector<VarBase> vars;
return vars;
}
virtual void Backward() { LOG(ERROR) << "To support customize"; }
};
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace imperative {} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
}
class Tracer {
public:
explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
root_scope_ = new framework::Scope();
scopes_[root_block_] = root_scope_;
}
virtual ~Tracer() { delete root_scope_; }
void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
const std::vector<VarBase*>& outputs,
framework::BlockDesc* block) {
framework::Scope* scope = GetScope(block);
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type();
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
*op->input_vars_ = inputs;
for (VarBase* input : inputs) {
const std::string vname = input->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
input->var_ = var;
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
if (input->pre_op_) {
op->pre_ops_->push_back(input->pre_op_);
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
} else {
op->pre_ops_->push_back(nullptr);
}
}
*op->output_vars_ = outputs;
for (size_t i = 0; i < outputs.size(); ++i) {
const std::string vname = outputs[i]->var_desc_->Name();
framework::Variable* var = scope->Var(vname);
if (!var->IsInitialized()) {
framework::VarDesc* var_desc = block->FindVar(vname);
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
}
outputs[i]->var_ = var;
outputs[i]->pre_op_ = op;
outputs[i]->pre_op_out_idx_ = i;
}
op_base->Run(*scope, platform::CPUPlace());
framework::OpDesc* grad_op_desc;
auto grad_to_var = new std::unordered_map<std::string, std::string>();
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
op->grad_op_desc_ = grad_op_desc;
op->grad_to_var_ = grad_to_var;
op->block_ = block;
}
framework::Scope* GetScope(framework::BlockDesc* block) {
if (scopes_.find(block) != scopes_.end()) {
return scopes_.at(block);
}
framework::BlockDesc* parent_block = block->ParentBlock();
PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
framework::Scope* scope = &scopes_[parent_block]->NewScope();
scopes_[block] = scope;
return scope;
}
private:
std::map<framework::BlockDesc*, framework::Scope*> scopes_;
framework::BlockDesc* root_block_;
framework::Scope* root_scope_;
};
} // namespace imperative
} // namespace paddle
......@@ -103,6 +103,7 @@ struct Argument {
// Model specified with program and parameters files.
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
// The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
......@@ -115,6 +116,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
std::vector<std::string>);
// Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
......
......@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
pass_num++;
}
if (pass_name == "mkldnn_placement_pass") {
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types()));
}
if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
......
......@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
argument->model_params_path_valid()) {
auto program =
LoadModel(argument->model_program_path(), argument->model_params_path(),
argument->scope_ptr(), place);
argument->scope_ptr(), place, argument->model_from_memory());
argument->SetMainProgram(program.release());
} else {
PADDLE_THROW(
......@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place) {
framework::Scope *scope, const platform::Place &place,
bool model_from_memory) {
framework::Executor exe(place);
return Load(&exe, scope, program_path, params_path);
if (!model_from_memory) {
return Load(&exe, scope, program_path, params_path);
} else {
return LoadFromMemory(&exe, scope, program_path, params_path);
}
}
std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
......
......@@ -24,7 +24,7 @@ namespace inference {
namespace analysis {
/*
* Load program and parameter to memory from the disk.
* Load program and parameter to memory from the disk or directly from memory.
*/
class IrGraphBuildPass : public AnalysisPass {
public:
......@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
const platform::Place &place);
std::unique_ptr<framework::ProgramDesc> LoadModel(
const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place);
framework::Scope *scope, const platform::Place &place,
bool model_from_memory);
std::string model_binary_str_;
};
......
......@@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) {
pass_builder_.reset(new GpuPassStrategy(
......@@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
}
......@@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
}
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
} // namespace paddle
......@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu);
argument_.SetGPUDeviceId(config_.device);
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
if (!config_.model_dir.empty()) {
argument_.SetModelDir(config_.model_dir);
......@@ -326,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
}
if (config_.use_mkldnn_) {
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
}
auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear();
argument_.SetIrAnalysisPasses(passes);
......@@ -448,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
return false;
}
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
// Create ProgramDesc
framework::proto::ProgramDesc proto;
proto.ParseFromString(pb_content);
if (!config_.model_from_memory()) {
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
proto.ParseFromString(pb_content);
} else {
proto.ParseFromString(config_.prog_file);
}
inference_program_.reset(new framework::ProgramDesc(proto));
return true;
}
......@@ -469,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
bool AnalysisPredictor::LoadParameters() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
"The inference program should be loaded first.");
const auto &global_block = inference_program_->MutableBlock(0);
// create a temporary program to load parameters.
......
......@@ -16,6 +16,7 @@
#include <cassert>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
// Here we include some header files with relative paths, for that in deploy,
......@@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig {
bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN();
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
bool use_mkldnn() const { return use_mkldnn_; }
void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list;
}
// Specify the memory buffer of program and parameter
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* program_buffer, size_t program_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
friend class ::paddle::AnalysisPredictor;
protected:
bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
int tensorrt_workspace_size_;
int tensorrt_max_batchsize_;
std::unique_ptr<PassStrategy> pass_builder_;
bool model_from_memory_{false};
};
// Configurations for Anakin engine.
......
......@@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy {
passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass :
std::vector<std::string>({"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
std::vector<std::string>({"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv3d_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass"})) {
passes_.push_back(pass);
}
......
......@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
const std::string& param_filename,
bool model_from_memory = false) {
const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc();
......@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
op->SetType("load_combine");
op->SetOutput("Out", paramlist);
op->SetAttr("file_path", {param_filename});
op->SetAttr("model_from_memory", {model_from_memory});
op->CheckAttrs();
}
......@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, dirname, "");
// model_from_memory is false in seperate parameters.
LoadPersistables(executor, scope, *main_program, dirname, "",
false /* model_from_memory */);
return main_program;
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, &program_desc_str);
ReadBinaryFile(prog_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
......@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_filename);
LoadPersistables(executor, scope, *main_program, "", param_filename,
false /* model_from_memory */);
return main_program;
}
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer) {
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(prog_buffer));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_buffer,
true /* model_filename */);
return main_program;
}
......
......@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
const std::string& param_filename,
bool model_from_memory);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
......@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const std::string& prog_filename,
const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer);
// Save the variables from a scope to disk.
void SaveVars(const framework::Scope& scope,
const std::vector<std::string>& vars, const std::string& dirname,
......
......@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
}
if (pool_type == "max") {
nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
// Under ceil mode, the pre_pad and post_pad are used to
// record the the padding size. In some ceil mode cases,
// we do not need padding, so we initialize the two vars to 0.
nvinfer1::DimsHW pre_pad(0, 0);
nvinfer1::DimsHW post_pad(0, 0);
if (ceil_mode) {
// If ceil mode is true, we will pad the appropriate size to the input.
DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
......
......@@ -188,10 +188,16 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
}
// Easy for profiling independently.
TEST(Analyzer_dam, profile) {
void profile(bool use_mkldnn = false) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
std::unordered_set<std::string> op_list = {"conv3d"};
cfg.SetMKLDNNOp(op_list);
}
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
......@@ -209,6 +215,11 @@ TEST(Analyzer_dam, profile) {
}
}
TEST(Analyzer_dam, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
#endif
// Check the fuse status
TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg;
......@@ -222,9 +233,14 @@ TEST(Analyzer_dam, fuse_statis) {
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_dam, compare) {
contrib::AnalysisConfig cfg;
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
std::unordered_set<std::string> op_list = {"conv3d"};
cfg.SetMKLDNNOp(op_list);
}
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
......@@ -233,5 +249,10 @@ TEST(Analyzer_dam, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
TEST(Analyzer_dam, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace inference
} // namespace paddle
......@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
if (memory_load) {
std::string buffer_prog, buffer_param;
ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size());
} else {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
}
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
......@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
}
// Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) {
void profile(bool memory_load = false) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
SetConfig(&cfg, memory_load);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
......@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
}
}
TEST(Analyzer_Chinese_ner, profile) { profile(); }
TEST(Analyzer_Chinese_ner, profile_memory_load) {
profile(true /* memory_load */);
}
// Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) {
contrib::AnalysisConfig cfg;
......
......@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
os << GenSpaces(num_spaces)
......@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config);
if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
} else {
os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n";
}
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n";
os << GenSpaces(num_spaces)
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
if(WIN32)
target_link_libraries(visualizer shlwapi)
endif(WIN32)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/visualizer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <fstream>
#include <memory>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/platform/init.h"
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(model_program_path, "", "model program path");
DEFINE_string(model_params_path, "", "model params path");
USE_PASS(graph_viz_pass);
USE_PASS(graph_to_program_pass);
using paddle::inference::analysis::Argument;
namespace paddle {
namespace inference {
namespace utils {
void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
bool Visualizer::Run() {
paddle::framework::InitDevices(false);
paddle::inference::analysis::Analyzer().Run(argument_);
return true;
}
} // namespace utils
} // namespace inference
} // namespace paddle
// Generate a dot file describing the structure of graph.
// To use this tool, run command: ./visualizer [options...]
// Options:
// --model_dir: the directory of model
// --model_program_path: the path of program
// --model_params_path: the path of params
int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
paddle::inference::analysis::Argument argument;
argument.SetUseGPU(false);
argument.SetUseTensorRT(false);
if (FLAGS_model_dir.empty()) {
if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
LOG(ERROR) << "Please set model_dir"
" or model_program_path and model_params_path";
return -1;
} else {
argument.SetModelProgramPath(FLAGS_model_program_path);
argument.SetModelParamsPath(FLAGS_model_params_path);
}
} else {
argument.SetModelDir(FLAGS_model_dir);
}
// Only 1 pass, default filename is 0_ir_origin.dot
// For more details, looking for paddle::inference::analysis::IRPassManager
argument.SetIrAnalysisPasses({"graph_viz_pass"});
std::unique_ptr<paddle::framework::Scope> scope{
new paddle::framework::Scope()};
argument.SetScopeNotOwned(
const_cast<paddle::framework::Scope *>(scope.get()));
paddle::inference::utils::Visualizer visualizer;
visualizer.SetArgument(&argument);
visualizer.Run();
return 0;
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/argument.h"
namespace paddle {
namespace inference {
namespace utils {
using paddle::inference::analysis::Argument;
class Visualizer final {
public:
Visualizer() = default;
~Visualizer() = default;
Visualizer(const Visualizer &) = delete;
Visualizer &operator=(const Visualizer &) = delete;
void SetArgument(Argument *);
bool Run();
private:
Argument *argument_;
};
} // namespace utils
} // namespace inference
} // namespace paddle
......@@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
"Input dim must be with 2 or 4");
PADDLE_ENFORCE(
x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
"Input dim must be with 2, 3 or 4");
std::vector<int> src_tz = framework::vectorize2int(x->dims());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bpr_loss_op.h"
namespace paddle {
namespace operators {
class BprLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, label_dims.size(),
"Input(X) and Input(Label) shall have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
auto y_dims = x_dims;
y_dims[rank - 1] = 1;
ctx->SetOutputDim("Y", y_dims);
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set that the data type of computation kernel of Seq-bpr
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class BprLossGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) shoudl be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
int rank = x_dims.size();
PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
"Input(Y@Grad) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(label_dims.size(), rank,
"Input(Label) and Input(X) should have the same rank.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(label_dims, 0, rank - 1),
"The Input(X) and Input(Label) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
framework::slice_ddim(dy_dims, 0, rank - 1),
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension.");
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
" the last dimension of Input(Label) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of computation kernel of cross_entropy
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a tensor whose last dimension "
"size is equal to the number of classes. This input is a "
"real number.");
AddInput(
"Label",
"(Tensor), the tensor which represents the ground truth. It has the "
"same shape with 'X' except the last dimension. the last dimension "
"size is 1.");
AddOutput("Y",
"(Tensor, default Tensor<float>), a tensor whose shape is same "
"with 'X' except that the last dimension size is 1. It "
"represents the sequence bpr loss.");
AddComment(R"DOC(
Bayesian Personalized Ranking Loss Operator.
This operator belongs to pairwise ranking loss. Label is the desired item.
The loss at a given point in one session is defined as:
$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
Learn more details by reading paper <session-based recommendations with recurrent
neural networks>(https://arxiv.org/abs/1511.06939)
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
ops::BprLossOpKernel<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
ops::BprLossGradientOpKernel<CPUCtx, float>,
ops::BprLossGradientOpKernel<CPUCtx, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
/*Todo:
*Find a way to adapt TolerableValue, using blas or eigen.
*/
template <typename T>
struct TolerableValue {
HOSTDEVICE T operator()(const T& x) const {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
if (x == INFINITY) return kApproInf;
if (x == -INFINITY) return -kApproInf;
return x;
}
};
template <typename DeviceContext, typename T>
class BprLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* label = ctx.Input<Tensor>("Label");
auto* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int rank = x->dims().size();
Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1);
Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
const framework::Tensor* logits = &x_2d;
const framework::Tensor* labels = &labels_2d;
framework::Tensor* out = &y_2d;
const int step_size = logits->dims()[0];
const int class_num = logits->dims()[1];
const T* logits_data = logits->data<T>();
T* loss_data = out->data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < step_size; ++i) {
int lbl_pos = label_data[i];
PADDLE_ENFORCE_GE(lbl_pos, 0);
PADDLE_ENFORCE_LT(lbl_pos, class_num);
int index_pos = i * class_num + lbl_pos;
T sum = static_cast<T>(0);
for (int j = 0; j < class_num; j++) {
if (j == lbl_pos) continue;
int index_neg = i * class_num + j;
sum += TolerableValue<T>()(-std::log(
1.0f + TolerableValue<T>()(std::exp(logits_data[index_neg] -
logits_data[index_pos]))));
}
loss_data[i] = -sum / (class_num - 1);
}
}
};
template <typename DeviceContext, typename T>
class BprLossGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* label = ctx.Input<Tensor>("Label");
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
const int step_size = x->dims()[0];
const int num_classes = x->dims()[1];
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* dy_data = dy->data<T>();
const T* x_data = x->data<T>();
const int64_t* label_data = label->data<int64_t>();
for (size_t sample_id = 0; sample_id < step_size; sample_id++) {
for (size_t x_offset = sample_id * num_classes;
x_offset < (sample_id + 1) * num_classes; x_offset++) {
dx_data[x_offset] = static_cast<T>(0);
}
auto p_index = sample_id * num_classes + label_data[sample_id];
for (size_t ni = 0; ni < num_classes; ni++) {
if (label_data[sample_id] == ni) continue;
auto n_index = sample_id * num_classes + ni;
auto grad_ = -dy_data[sample_id] /
((num_classes - 1) *
(1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
x_data[n_index]))));
dx_data[p_index] += grad_;
dx_data[n_index] -= grad_;
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
......@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
if ((activation == "identity") &&
(algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
(!residual)) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
// But test in some case, the speed is slower, change to use
// cudnnConvolutionForward and cudnnAddTensor
// ------------- cudnn conv forward and bias add ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
} else {
if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
}
};
#endif
......
......@@ -28,6 +28,46 @@ using mkldnn::stream;
using platform::to_void_cast;
using platform::GetMKLDNNFormat;
inline void GetWeightsTz(std::vector<int>& weights_tz, int groups, // NOLINT
bool is_conv3d) {
if (groups > 1) {
if (is_conv3d) {
int output = weights_tz[0];
int input = weights_tz[1];
int dimension = weights_tz[2];
int height = weights_tz[3];
int width = weights_tz[4];
weights_tz.resize(6);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = dimension;
weights_tz[4] = height;
weights_tz[5] = width;
} else {
int output = weights_tz[0];
int input = weights_tz[1];
int height = weights_tz[2];
int width = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = height;
weights_tz[4] = width;
}
}
}
inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
int groups, bool is_conv3d) {
if (is_conv3d) {
return (groups == 1) ? format : mkldnn::memory::format::goidhw;
} else {
return (groups == 1) ? format : mkldnn::memory::format::goihw;
}
}
template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
......@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
"Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != memory::format::format_undef,
......@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
int groups = ctx.Attr<int>("groups");
bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
is_conv3d
? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
......@@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
......@@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline;
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
{src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
......@@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
memory::dims stride_dims = strides;
memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
......@@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
memory::dims stride_dims = strides;
memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
......@@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool is_conv3d = strides.size() == 3U;
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
......@@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
// Get an unique name from "argument" name of "Output" variable
// as well as attributes of primitive to be created
// This name will be used as key when saving info into device context
......@@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Create user memory descriptors
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
{src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
{weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
auto user_diff_dst_md = platform::MKLDNNMemDesc(
{dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
......@@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
auto diff_weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
......@@ -500,3 +555,13 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNGradOpKernel<float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNGradOpKernel<float>);
......@@ -134,14 +134,14 @@ void Conv2DOpMaker::Make() {
"The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.")
.AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddInput("ResidualData",
"(Tensor) Tensor with residual data "
"to which convolution output will be added."
"Used with fuse_residual_connection fusion.")
.AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of "
......@@ -232,6 +232,10 @@ $$
}
void Conv3DOpMaker::Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput(
"Input",
"(Tensor) The input tensor of convolution operator. "
......@@ -247,6 +251,11 @@ void Conv3DOpMaker::Make() {
"is the width of the filter."
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddInput("ResidualData",
"(Tensor) Tensor with residual data "
"to which convolution output will be added."
"Used with fuse_residual_connection fusion.")
.AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW.");
......@@ -280,6 +289,13 @@ void Conv3DOpMaker::Make() {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
"connection.")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......
......@@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor
if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
DEPS lod_tensor selected_rows_functor memory)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
selected_rows_functor scope math_function SERIAL)
endif()
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
else()
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <condition_variable> // NOLINT
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/distributed/collective_client.h"
DECLARE_int32(rpc_deadline);
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag CollectiveClient::init_flag_;
std::unique_ptr<CollectiveClient> CollectiveClient::client_(nullptr);
bool CollectiveClient::Gather(const std::vector<RemoteVar>& remote_vars,
std::vector<const framework::SelectedRows*>* dst,
const platform::DeviceContext& ctx,
framework::Scope* scope, int64_t time_out) {
for (auto r : remote_vars) {
VLOG(50) << "begin gather from ep:" << r.String();
scope->Var(r.var_name_)->GetMutable<framework::SelectedRows>();
VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable(
r.ep_, ctx, *scope, r.var_name_, time_out);
}
rpc_client_->Wait();
for (auto r : remote_vars) {
auto select_rows =
scope->FindVar(r.var_name_)->GetMutable<framework::SelectedRows>();
dst->push_back(select_rows);
VLOG(4) << "gather from ep:" << r.String()
<< ", select_rows:" << GetSelectedRowsInfo(*select_rows);
rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_);
}
rpc_client_->Wait();
return true;
}
} // namespace distributed
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <condition_variable> // NOLINT
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_deadline);
namespace paddle {
namespace operators {
namespace distributed {
inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) {
std::stringstream ss;
ss << ", height:" << slr.height() << ", rows:[";
for (unsigned int i = 0; i < slr.rows().size(); i++) {
if (i != slr.rows().size() - 1) {
ss << slr.rows()[i] << ",";
} else {
ss << slr.rows()[i];
}
}
ss << "], dims:" << slr.value().dims();
return ss.str();
}
struct RemoteVar {
std::string ep_;
std::string var_name_;
int trainer_id_{0};
std::string String() {
std::stringstream ss;
ss << "ep:" << ep_ << ", var_name:" << var_name_
<< ", trainer_id:" << trainer_id_;
return ss.str();
}
};
class CollectiveClient {
public:
CollectiveClient() {
rpc_client_.reset(new RPCCLIENT_T());
rpc_client_->InitImpl();
}
virtual ~CollectiveClient() {}
// note this function will retain the rank order.
bool Gather(const std::vector<RemoteVar>& remote_vars,
std::vector<const framework::SelectedRows*>* dst,
const platform::DeviceContext& ctx, framework::Scope* scope,
int64_t time_out = FLAGS_rpc_deadline);
static CollectiveClient* GetInstance() {
std::call_once(init_flag_, [&]() {
if (client_.get() == nullptr) {
client_.reset(new CollectiveClient());
}
});
return client_.get();
}
private:
std::unique_ptr<RPCClient> rpc_client_;
static std::once_flag init_flag_;
static std::unique_ptr<CollectiveClient> client_;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h> // for removing the port file
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/operators/distributed/collective_server.h"
DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag CollectiveServer::init_flag_;
std::shared_ptr<CollectiveServer> CollectiveServer::collective_server_(nullptr);
CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in;
rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
}
void CollectiveServer::Stop() {
rpc_server_->ShutDown();
server_thread_->join();
loop_thread_->join();
}
void CollectiveServer::StartServer() {
get_monomer_handler_.reset(new GetMonomerHandler());
get_monomer_handler_->SetRPCServer(rpc_server_.get());
get_barrier_handler_.reset(new GetMonomerBarrierHandler());
get_barrier_handler_->SetRPCServer(rpc_server_.get());
rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
get_monomer_handler_.get(),
FLAGS_collective_get_thread_num);
rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
get_barrier_handler_.get(), 1);
server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
rpc_server_->WaitServerReady();
loop_thread_.reset(new std::thread([&]() {
while (true) {
if (rpc_server_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!";
break;
}
sleep(1);
}
VLOG(1) << "CollectiveServer loop_thread end";
}));
}
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <set>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle {
namespace operators {
namespace distributed {
class CollectiveServer;
class GetMonomerHandler final : public RequestHandler {
public:
GetMonomerHandler() : RequestHandler(true) {}
virtual ~GetMonomerHandler() {}
bool Handle(const std::string& var_name, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override {
VLOG(50) << "GetMonomerHandler recv " << var_name;
*outvar = scope->FindVar(var_name);
PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name);
return true;
}
};
class GetMonomerBarrierHandler final : public RequestHandler {
public:
GetMonomerBarrierHandler() : RequestHandler(true) {}
virtual ~GetMonomerBarrierHandler() {}
bool Handle(const std::string& var_name, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override {
VLOG(50) << "GetMonomerHandler recv " << var_name;
rpc_server_->IncreaseVarBarrier(var_name);
return true;
}
};
class CollectiveServer final {
public:
explicit CollectiveServer(const std::string& end_point, int fan_in);
virtual ~CollectiveServer() {}
void StartServer();
static CollectiveServer* GetInstance(const std::string& end_point,
int fan_in) {
std::call_once(init_flag_, [&]() {
if (collective_server_.get() == nullptr) {
collective_server_.reset(new CollectiveServer(end_point, fan_in));
collective_server_->StartServer();
}
});
return collective_server_.get();
}
std::shared_ptr<RPCServer> GetRPCServer() { return rpc_server_; }
void Stop();
private:
std::unique_ptr<GetMonomerHandler> get_monomer_handler_;
std::unique_ptr<GetMonomerBarrierHandler> get_barrier_handler_;
std::shared_ptr<distributed::RPCServer> rpc_server_;
std::shared_ptr<std::thread> server_thread_;
std::shared_ptr<std::thread> loop_thread_;
bool ready_{false};
static std::once_flag init_flag_;
static std::shared_ptr<CollectiveServer> collective_server_;
};
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;
std::unique_ptr<distributed::CollectiveServer> StartServer(
const std::string& ep, int fan_in, framework::Scope* scope,
platform::DeviceContext* dev_ctx) {
distributed::CollectiveServer* server =
distributed::CollectiveServer::GetInstance(ep, fan_in);
auto rpc_server = server->GetRPCServer();
rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable,
scope, dev_ctx);
std::cout << "StartServer return" << std::endl;
return std::unique_ptr<distributed::CollectiveServer>(server);
}
std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1");
auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(1000);
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({3, 5}));
tensor->mutable_data<float>(place);
paddle::operators::math::set_constant(ctx, tensor, 32.7);
for (int i = 0; i < 3; ++i) rows->push_back(i);
std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr);
return std::unique_ptr<framework::Scope>(scope);
}
void Gather(const std::vector<distributed::RemoteVar>& vars,
platform::DeviceContext* dev_ctx) {
distributed::CollectiveClient* client =
distributed::CollectiveClient::GetInstance();
framework::Scope* scope = new framework::Scope();
framework::Variable* var = scope->Var("var1");
var->GetMutable<framework::SelectedRows>();
std::vector<const framework::SelectedRows*> dst;
client->Gather(vars, &dst, *dev_ctx, scope);
std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
}
TEST(PREFETCH, GPU) {
platform::CUDAPlace place;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
std::string ep = "127.0.0.1:7164";
auto scope = GenerateVars(place);
auto* v1 = scope->FindVar("var1");
std::cout << "var1:" << v1 << std::endl;
auto server = StartServer(ep, 2, scope.get(), &ctx);
auto rpc_server = server->GetRPCServer();
distributed::RemoteVar var;
var.ep_ = ep;
var.var_name_ = "var1";
var.trainer_id_ = 0;
std::vector<distributed::RemoteVar> vars{var};
Gather(vars, &ctx);
Gather(vars, &ctx);
std::cout << "begin WaitVarBarrier" << std::endl;
rpc_server->WaitVarBarrier("var1");
rpc_server->ClearRegisteredVars();
server->Stop();
scope.release();
server.release();
}
......@@ -28,11 +28,11 @@ namespace paddle {
namespace operators {
namespace distributed {
void GRPCClient::InitImpl() { InitEventLoop(); }
void GRPCClient::InitEventLoop() {
void GRPCClient::InitImpl() {
// start the client process thread
// TODO(wuyi): can make this in a threadpool
PADDLE_ENFORCE(client_thread_ == nullptr,
"please not re init proceed thread");
client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
}
......@@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
void ProcGetResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
VLOG(100) << "ProcGetResponse";
framework::Variable* outvar = nullptr;
// get response's trainer_id is not used
int trainer_id;
......@@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name,
"/sendrecv.SendRecvService/GetVariable", time_out);
}
VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name,
"/sendrecv.SendRecvService/GetMonomerVariable", time_out);
}
VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
const std::string& rpc_path,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
......@@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
// prepare input
sendrecv::VariableMessage req;
req.set_varname(var_name_val);
......@@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
platform::RecordRPCEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
auto call =
s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......@@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
return h;
}
VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
const std::string& var_name,
int64_t time_out) {
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
const std::string method = "SendMonomerFetchBarrierRPC";
VarHandlePtr h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
sendrecv::VariableMessage req;
req.set_varname(var_name);
platform::RecordRPCEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h;
}
VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep);
......
......@@ -189,6 +189,11 @@ class GRPCClient : public RPCClient {
const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetMonomerVariable(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
......@@ -200,8 +205,12 @@ class GRPCClient : public RPCClient {
VarHandlePtr AsyncSendBatchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendFetchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) override;
VarHandlePtr AsyncGetMonomerBarrier(
const std::string& ep, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncCheckpointNotify(
const std::string& ep, const std::string& dir,
......@@ -214,21 +223,22 @@ class GRPCClient : public RPCClient {
void SendComplete() override;
protected:
void InitImpl() override;
private:
// InitEventLoop should only be called by Init()
void InitEventLoop();
void Proceed();
std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
VarHandlePtr _AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name, const std::string& rpc,
int64_t time_out);
private:
grpc::CompletionQueue cq_;
std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
std::unique_ptr<std::thread> client_thread_;
std::unique_ptr<std::thread> client_thread_{nullptr};
// mutex for Wait client sync
std::mutex sync_mutex_;
......
......@@ -158,6 +158,98 @@ class RequestGet final : public RequestBase {
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
class RequestGetMonomerVariable final : public RequestBase {
public:
explicit RequestGetMonomerVariable(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler,
int req_id, RPCServer* rpc_server)
: RequestBase(service, cq, request_handler, req_id),
responder_(&ctx_),
rpc_server_(rpc_server) {
auto method_id =
static_cast<int>(distributed::GrpcMethod::kGetMonomerVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestGetMonomerVariable() {}
std::string GetReqName() override { return request_.varname(); }
void Process() override {
// proc request.
std::string varname = request_.varname();
rpc_server_->WaitVarCond(varname);
MonomerHandle h = rpc_server_->GetMonomer(varname);
auto scope = h.scope_;
auto invar = scope->FindVar(varname);
framework::Variable* outvar = nullptr;
request_handler_->Handle(varname, scope, invar, &outvar,
request_.trainer_id());
if (outvar) {
SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_);
}
Finish(reply_, &responder_);
}
protected:
sendrecv::VariableMessage request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
RPCServer* rpc_server_{nullptr};
};
class RequestGetMonomerBarrier final : public RequestBase {
public:
explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id,
RPCServer* rpc_server)
: RequestBase(service, cq, request_handler, req_id),
responder_(&ctx_),
rpc_server_(rpc_server) {
auto method_id =
static_cast<int>(distributed::GrpcMethod::kGetMonomerBarrier);
service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestGetMonomerBarrier() {}
std::string GetReqName() override { return request_.varname(); }
void Process() override {
// proc request.
std::string varname = request_.varname();
VLOG(4) << "RequestGetMonomerBarrier " << varname;
rpc_server_->WaitVarCond(varname);
MonomerHandle h = rpc_server_->GetMonomer(varname);
framework::Scope* scope = nullptr;
framework::Variable* invar = nullptr;
framework::Variable* outvar = nullptr;
request_handler_->Handle(varname, scope, invar, &outvar,
request_.trainer_id());
Finish(reply_, &responder_);
}
protected:
sendrecv::VariableMessage request_;
sendrecv::VoidMessage reply_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
RPCServer* rpc_server_{nullptr};
};
class RequestPrefetch final : public RequestBase {
public:
explicit RequestPrefetch(GrpcService::AsyncService* service,
......@@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase {
};
void AsyncGRPCServer::WaitServerReady() {
VLOG(4) << "AsyncGRPCServer is wait server ready";
VLOG(4) << "AsyncGRPCServer is waiting server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(4) << "AsyncGRPCServer WaitSeverReady";
......@@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestSend(&service_, cq.get(), handler, req_id);
} else if (rpc_name == kRequestGet) {
b = new RequestGet(&service_, cq.get(), handler, req_id);
} else if (rpc_name == kRequestGetMonomerVariable) {
b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
this);
} else if (rpc_name == kRequestGetMonomerBarrier) {
b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id,
this);
} else if (rpc_name == kRequestPrefetch) {
b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
} else if (rpc_name == kRequestCheckpoint) {
......@@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
reqs[req_id] = b;
VLOG(4) << "Create RequestSend status:" << b->Status();
VLOG(4) << "TryToRegisterNewOne status:" << b->Status();
}
void AsyncGRPCServer::HandleRequest(
......
......@@ -81,10 +81,12 @@ enum class GrpcMethod {
kGetVariable,
kPrefetchVariable,
kCheckpointNotify,
kGetMonomerVariable,
kGetMonomerBarrier,
};
static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;
inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) {
......@@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/SendVariable";
case GrpcMethod::kGetVariable:
return "/sendrecv.SendRecvService/GetVariable";
case GrpcMethod::kGetMonomerVariable:
return "/sendrecv.SendRecvService/GetMonomerVariable";
case GrpcMethod::kGetMonomerBarrier:
return "/sendrecv.SendRecvService/GetMonomerBarrier";
case GrpcMethod::kPrefetchVariable:
return "/sendrecv.SendRecvService/PrefetchVariable";
case GrpcMethod::kCheckpointNotify:
......
......@@ -37,6 +37,8 @@ namespace distributed {
constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet";
constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable";
constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
constexpr char kRequestPrefetch[] = "RequestPrefetch";
constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
......
......@@ -45,6 +45,11 @@ class RPCClient {
const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncGetMonomerVariable(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncPrefetchVar(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& in_var_name,
......@@ -57,6 +62,10 @@ class RPCClient {
virtual VarHandlePtr AsyncSendFetchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncGetMonomerBarrier(
const std::string& ep, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncCheckpointNotify(
const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) = 0;
......@@ -87,8 +96,9 @@ class RPCClient {
}
}
protected:
virtual void InitImpl() {}
protected:
// each trainer have exact one trainer id, it should be static
static int trainer_id_;
......
......@@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
}
void RPCServer::RegisterVar(const std::string& var_name,
const std::string& rpc_name,
framework::Scope* scope,
platform::DeviceContext* dev_ctx) {
MonomerHandle h;
h.var_name_ = var_name;
h.rpc_name_ = rpc_name;
h.scope_ = scope;
h.dev_ctx_ = dev_ctx;
{
std::unique_lock<std::mutex> lock(mutex_);
if (var_map_.find(var_name) != var_map_.end()) {
PADDLE_ENFORCE(false, "%s alreay in var_map", var_name);
}
var_map_[var_name] = h;
}
rpc_cond_.notify_all();
VLOG(4) << "RegisterVar context:" << h.String();
}
void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
int b = 0;
MonomerHandle h;
{
std::unique_lock<std::mutex> lock(mutex_);
b = ++var_map_[var_name].barrier_;
h = var_map_[var_name];
}
if (b >= client_num_) {
barrier_cond_.notify_all();
}
VLOG(4) << "IncreaseVarBarrier context:" << h.String();
}
void RPCServer::WaitVarBarrier(const std::string& var_name) {
VLOG(4) << "WaitBarrier var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_);
barrier_cond_.wait(lock, [&]() {
return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
exit_flag_.load());
});
VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String();
}
void RPCServer::SetVarCond(const std::string& var_name) {
VLOG(4) << "SetVarCond var_name:" << var_name;
{
std::unique_lock<std::mutex> lock(mutex_);
if (var_map_.find(var_name) != var_map_.end()) {
rpc_cond_.notify_all();
}
}
}
void RPCServer::WaitVarCond(const std::string& var_name) {
VLOG(4) << "WaitVarCond var_name:" << var_name;
std::unique_lock<std::mutex> lock(mutex_);
rpc_cond_.wait(lock, [=] {
return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
});
VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
}
MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
MonomerHandle h;
{
std::unique_lock<std::mutex> lock(mutex_);
h = var_map_[var_name];
}
return h;
}
void RPCServer::ClearRegisteredVars() {
std::unique_lock<std::mutex> lock(mutex_);
var_map_.clear();
}
void RPCServer::ClearVar(const std::string& var_name) {
std::unique_lock<std::mutex> lock(mutex_);
var_map_.erase(var_name);
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -21,12 +21,30 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace distributed {
struct MonomerHandle {
std::string var_name_;
std::string rpc_name_;
framework::Scope* scope_{nullptr};
platform::DeviceContext* dev_ctx_{nullptr};
int64_t barrier_{0};
std::string String() {
std::stringstream ss;
ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
<< ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
<< ", barrier_:" << barrier_;
return ss.str();
}
};
class RPCServer {
public:
explicit RPCServer(const std::string& address, int client_num)
......@@ -67,6 +85,16 @@ class RPCServer {
void WaitCond(const std::string& rpc_name);
void IncreaseBatchBarrier(const std::string rpc_name);
void RegisterVar(const std::string& var_name, const std::string& rpc_name,
framework::Scope* scope, platform::DeviceContext* dev_ctx);
void IncreaseVarBarrier(const std::string& var_name);
void WaitVarBarrier(const std::string& var_name);
void SetVarCond(const std::string& var_name);
void WaitVarCond(const std::string& var_name);
void ClearRegisteredVars();
void ClearVar(const std::string& var_name);
MonomerHandle GetMonomer(const std::string& var_name);
void Complete();
void ResetBarrierCounter();
......@@ -95,6 +123,9 @@ class RPCServer {
std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
std::unordered_map<std::string, int> rpc_thread_num_;
friend class RequestHandler;
// TODO(gongwb): use more cond to notify or wait;
std::unordered_map<std::string, MonomerHandle> var_map_;
};
}; // namespace distributed
......
......@@ -28,6 +28,9 @@ service SendRecvService {
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
}
// VariableMessage is serialized paddle variable message.
......
......@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
auto load_as_fp16 = Attr<bool>("load_as_fp16");
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
auto model_from_memory = Attr<bool>("model_from_memory");
auto out_var_names = Outputs("Out");
PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0.");
if (!model_from_memory) {
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
} else {
PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
std::stringstream fin(filename);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
}
}
void LoadParamsFromBuffer(
const framework::Scope &scope, const platform::Place &place,
std::istream *buffer, bool load_as_fp16,
const std::vector<std::string> &out_var_names) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
......@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
filename);
PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
// Get data from fin to tensor
DeserializeFromStream(fin, tensor, dev_ctx);
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = framework::ToDataType(tensor->type());
auto out_dtype =
......@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"LoDTensors will be loaded from \"file_path\".")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
AddAttr<bool>("model_from_memory",
"(boolean, default false)"
"If true, file_path is in memory, and LoDTensors will be "
"loaded directly from memory")
.SetDefault(false);
AddComment(R"DOC(
LoadCombine Operator.
LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator loads LoDTensor variables from a file, which could be
loaded in memory already. The file should contain one or more LoDTensors
serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
......
......@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
auto rows_idx = outs_rows_idx[i];
outs[i]->set_height(height_sections[i]);
auto dims = x->GetCompleteDims();
dims[0] = rows_idx.size();
outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
outs[i]->mutable_rows()->clear();
if (rows_idx.size() > 0) {
auto dims = x->GetCompleteDims();
dims[0] = rows_idx.size();
outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
for (auto idx : rows_idx) {
outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
}
......@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
}
}
}
PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
"rows should has the same size with tensor dim 0");
}
}
};
......
......@@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
void* allocate(size_t num_bytes) const override {
if (UNLIKELY(num_bytes == 0)) {
return nullptr;
}
auto buf = paddle::memory::Alloc(place_, num_bytes,
memory::Allocator::kScratchpad);
void* retv = buf->ptr();
allocations_[buf->ptr()] = std::move(buf);
{
std::lock_guard<std::mutex> lock(mtx_);
allocations_.emplace(retv, std::move(buf));
}
return retv;
}
void deallocate(void* buffer) const override {
allocations_.erase(allocations_.find(buffer));
if (LIKELY(buffer)) {
std::lock_guard<std::mutex> lock(mtx_);
allocations_.erase(buffer);
}
}
void* scratchpad() const override {
......@@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
};
......@@ -210,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 100) / 10 << ".";
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version = runtime_version_ / 100;
auto compile_cuda_version = CUDA_VERSION / 100;
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDA "
<< compile_cuda_version / 10 << "." << compile_cuda_version % 10
<< ", but CUDA runtime version in your machine is "
<< local_cuda_version / 10 << "." << local_cuda_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
}
}
callback_manager_.reset(new StreamCallbackManager(stream_));
}
......
......@@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
return mkldnn::memory::format::x;
} else if (dims_size == 2) {
return mkldnn::memory::format::nc;
} else if (dims_size == 3) {
if (data_format == mkldnn::memory::format::nchw) {
return mkldnn::memory::format::ncw;
} else if (data_format == mkldnn::memory::format::nhwc) {
return mkldnn::memory::format::nwc;
}
} else if (dims_size == 5) {
if (data_format == mkldnn::memory::format::nchw) {
return mkldnn::memory::format::ncdhw;
} else if (data_format == mkldnn::memory::format::nhwc) {
return mkldnn::memory::format::ndhwc;
}
}
return data_format;
}
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/imperative/tracer.h"
namespace paddle {
namespace pybind {
// Bind Methods
void BindTracer(pybind11::module *m) {
pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
.def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
.def("trace", &imperative::Tracer::Trace)
.def("get_scope", &imperative::Tracer::GetScope,
pybind11::return_value_policy::reference);
}
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace paddle {
namespace pybind {
class PyLayer : public imperative::Layer {
public:
using imperative::Layer::Layer; // Inherit constructors
std::vector<imperative::VarBase> Forward(
const std::vector<imperative::VarBase>& inputs) override {
PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
inputs); // NOLINT
}
void Backward() override {
PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT
}
};
class PyOpBase : public imperative::OpBase {
public:
using imperative::OpBase::OpBase; // Inherit constructors
};
class PyVarBase : public imperative::VarBase {
public:
using imperative::VarBase::VarBase; // Inherit constructors
};
void BindTracer(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
......@@ -45,6 +46,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/async_executor_py.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/recordio.h"
......@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) {
BindException(&m);
py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
.def(py::init<>())
.def("_run_backward",
[](imperative::VarBase &self, framework::Scope *scope) {
self.RunBackward(scope);
})
.def("_grad", &imperative::VarBase::Grad)
.def_property(
"desc",
[](const imperative::VarBase &self) { return self.var_desc_; },
[](imperative::VarBase &self, framework::VarDesc *var_desc) {
self.var_desc_ = var_desc;
},
py::return_value_policy::reference);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<>())
.def_property(
"desc", [](const imperative::OpBase &self) { return self.op_desc_; },
[](imperative::OpBase &self, framework::OpDesc *op_desc) {
if (op_desc) {
self.op_desc_ = op_desc;
}
},
py::return_value_policy::reference);
py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>())
.def("forward",
[](imperative::Layer &self,
const std::vector<imperative::VarBase> &inputs) {
return self.Forward(inputs);
})
.def("backward", &imperative::Layer::Backward);
BindTracer(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer(
[](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
......@@ -298,6 +336,8 @@ PYBIND11_MODULE(core, m) {
.def("get_tensor",
[](SelectedRows &self) { return self.mutable_value(); },
py::return_value_policy::reference)
.def("numel",
[](SelectedRows &self) -> int64_t { return self.value().numel(); })
.def("set_height", &SelectedRows::set_height)
.def("height", &SelectedRows::height)
.def("set_rows",
......@@ -601,6 +641,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable);
m.def("get_variable_tensor", framework::GetVariableTensor);
m.def("_is_program_version_supported", IsProgramVersionSupported);
......@@ -886,6 +927,18 @@ All parameter, weight, gradient are variables in Paddle.
[](BuildStrategy &self, int num_trainers) {
self.num_trainers_ = num_trainers;
})
.def_property(
"trainers_endpoints",
[](const BuildStrategy &self) { return self.trainers_endpoints_; },
[](BuildStrategy &self,
const std::vector<std::string> &trainers_endpoints) {
self.trainers_endpoints_ = trainers_endpoints;
})
.def_property("trainer_id",
[](const BuildStrategy &self) { return self.trainer_id_; },
[](BuildStrategy &self, int trainer_id) {
self.trainer_id_ = trainer_id;
})
.def_property(
"fuse_elewise_add_act_ops",
[](const BuildStrategy &self) {
......
......@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
paddle::platform::CPUPlace place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
for (int i = 0; i < array.ndim(); ++i) {
dims.push_back(static_cast<int>(array.shape()[i]));
}
......
......@@ -32,11 +32,28 @@ the image layout as follows.
from __future__ import print_function
import six
import numpy as np
try:
import cv2
except ImportError:
cv2 = None
# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here
# https://github.com/numpy/numpy/issues/12497
if six.PY3:
import subprocess
import sys
import_cv2_proc = subprocess.Popen(
[sys.executable, "-c", "import cv2"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out, err = import_cv2_proc.communicate()
retcode = import_cv2_proc.poll()
if retcode != 0:
cv2 = None
else:
import cv2
else:
try:
import cv2
except ImportError:
cv2 = None
import os
import tarfile
import six.moves.cPickle as pickle
......
......@@ -34,6 +34,7 @@ from . import io
from . import evaluator
from . import initializer
from . import layers
from . import imperative
from . import contrib
from . import nets
from . import optimizer
......@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'initializer',
'layers',
'contrib',
'imperative',
'transpiler',
'nets',
'optimizer',
......
......@@ -258,10 +258,13 @@ class DataFeeder(object):
multiple mini-batches. Each mini-batch will be feed on each device.
Args:
reader(fun): the input data.
multi_devices(bool): the number of places. Default None.
num_places(int): the number of places. Default None.
drop_last(bool): the number of places. Default None.
reader(function): the reader is the function which can generate data.
multi_devices(bool): whether to use multiple devices or not.
num_places(int): if the multi_devices is True, you can specify the number
of GPU to use, if 'num_places' is None, the function will use all the
GPU of the current machine. Default None.
drop_last(bool): whether to drop the last batch if the
size of the last batch is less than batch_size. Default True.
Returns:
dict: the result of conversion.
......
......@@ -18,6 +18,7 @@ import collections
import contextlib
import re
import six
import sys
import numpy as np
......@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None
def _in_imperative_mode():
return _imperative_tracer_ is not None
def _imperative_tracer():
return _imperative_tracer_
class NameScope(object):
def __init__(self, name="", parent=None):
......@@ -345,6 +356,21 @@ class Variable(object):
self.op = None
self.stop_gradient = stop_gradient
self.is_data = is_data
if _in_imperative_mode():
self._ivar = core.VarBase()
self._ivar.desc = self.desc
def _numpy(self):
scope = _imperative_tracer().get_scope(self.block.desc)
tensor = core.get_variable_tensor(scope, self.desc.name())
return np.array(tensor)
def _backward(self):
scope = _imperative_tracer().get_scope(self.block.desc)
self._ivar._run_backward(scope)
def _gradient(self):
return np.array(self._ivar._grad())
def __str__(self):
return self.to_string(True)
......@@ -655,6 +681,23 @@ class Operator(object):
if self._has_kernel(type):
self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc)
if _in_imperative_mode():
self.iop = core.OpBase()
self.iop.desc = self.desc
self.inputs = []
if inputs is not None:
for inp in inputs.values():
if isinstance(inp, Variable):
self.inputs.append(inp)
elif isinstance(inp, list) or isinstance(inp, tuple):
self.inputs.extend(inp[:])
self.outputs = []
if outputs is not None:
for out in outputs.values():
if isinstance(out, Variable):
self.outputs.append(out)
elif isinstance(out, list) or isinstance(out, tuple):
self.outputs.extend(out[:])
def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET
......@@ -1041,19 +1084,15 @@ class Block(object):
raise ValueError("var %s not in this block" % name)
return v
def _var_recursive(self, name):
def _find_var_recursive(self, name):
"""
Get a Variable by name from this block recursively.
Args:
name(str): the Variable's name.
Raises:
ValueError: this block and this parent block doesn't
have a Variable with the giving name.
Returns:
Variable: the Variable with the giving name.
Variable: the Variable with the giving name. Or None if not found.
"""
frontier = list()
visited = set()
......@@ -1079,8 +1118,27 @@ class Block(object):
frontier.append(prog.block(cur.forward_block_idx))
visited.add(id(cur))
return None
raise ValueError("Var {0} is not found recursively".format(name))
def _var_recursive(self, name):
"""
Get a Variable by name from this block recursively.
Args:
name(str): the Variable's name.
Raises:
ValueError: this block and this parent block doesn't
have a Variable with the giving name.
Returns:
Variable: the Variable with the giving name.
"""
var = self._find_var_recursive(name)
if var:
return var
else:
raise ValueError("Var {0} is not found recursively".format(name))
def all_parameters(self):
return list(self.iter_parameters())
......@@ -1206,6 +1264,9 @@ class Block(object):
"""
op_desc = self.desc.append_op()
op = Operator(block=self, desc=op_desc, *args, **kwargs)
if _in_imperative_mode():
_imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
[v._ivar for v in op.outputs], self.desc)
self.ops.append(op)
return op
......@@ -1442,6 +1503,7 @@ class Program(object):
self._is_chief = False
self._slice_vars_and_attrs = []
self._endpoints = []
self._trainers_endpoints = []
self._distributed_lookup_table = None
@property
......@@ -2209,3 +2271,12 @@ def _get_var(name, program=None):
assert isinstance(program, Program)
return program.global_block().var(name)
@contextlib.contextmanager
def _imperative_guard(tracer):
global _imperative_tracer_
tmp_trace = _imperative_tracer_
_imperative_tracer_ = tracer
yield
_imperative_tracer_ = tmp_trace
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import base
from .base import *
from . import layers
from .layers import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
__all__ = ['enabled', 'guard', 'to_variable']
def enabled():
return framework._in_imperative_mode()
@contextlib.contextmanager
def guard():
train = framework.Program()
startup = framework.Program()
tracer = core.Tracer(train.current_block().desc)
with framework.program_guard(train, startup):
with framework.unique_name.guard():
with framework._imperative_guard(tracer):
yield
def to_variable(value, block=None):
if isinstance(value, np.ndarray):
if not block:
block = framework.default_main_program().current_block()
py_var = framework.Variable(
block,
type=core.VarDesc.VarType.LOD_TENSOR,
name=None,
shape=value.shape,
dtype=value.dtype)
scope = framework._imperative_tracer().get_scope(block.desc)
var = scope.var(py_var.name)
tensor = var.get_tensor()
tensor.set(value, core.CPUPlace())
return py_var
elif isinstance(value, framework.Variable):
return value
else:
raise ValueError("Unsupported type %s" % type(value))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import sys
import numpy as np
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.imperative import base
__all__ = ['PyLayer']
class PyLayer(core.Layer):
def __init__(self):
pass
def __call__(self, inputs):
# TODO(panyx0718): Support declarative mode as well.
assert base.enabled()
if not isinstance(inputs, list) and not isinstance(inputs, tuple):
inputs = [inputs]
var_inputs = []
for x in inputs:
py_var = base.to_variable(x)
var_inputs.append(py_var)
outputs = self.forward(var_inputs)
return outputs
def forward(self, inputs):
return []
......@@ -145,7 +145,7 @@ def save_vars(executor,
prog = fluid.default_main_program()
fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
vars=None)
vars=None, predicate = name_has_fc)
# All variables in `main_program` whose name includes "fc" will be saved.
# And variables are going to be saved separately.
......@@ -369,7 +369,7 @@ def load_vars(executor,
prog = fluid.default_main_program()
fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
vars=None)
vars=None, predicate=name_has_fc)
# All variables in `main_program` whose name includes "fc" will be loaded.
# And all the variables are supposed to have been saved in differnet files.
......
......@@ -17,10 +17,13 @@ from __future__ import print_function
import copy
import itertools
import six
import sys
import numpy as np
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
from . import unique_name
from paddle.fluid.initializer import Constant, Xavier
from paddle.fluid.imperative import base
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core
from six.moves import zip
......@@ -46,23 +49,21 @@ class LayerHelper(object):
def startup_program(self):
return default_startup_program()
def to_variable(self, x):
return base.to_variable(x, self.main_program.current_block())
def append_op(self, *args, **kwargs):
return self.main_program.current_block().append_op(*args, **kwargs)
def multiple_input(self, input_param_name='input'):
inputs = self.kwargs.get(input_param_name, [])
type_error = TypeError(
"Input of {0} layer should be Variable or sequence of Variable".
format(self.layer_type))
if isinstance(inputs, Variable):
inputs = [inputs]
elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
raise type_error
ret = []
if isinstance(inputs, list) or isinstance(inputs, tuple):
for inp in inputs:
ret.append(self.to_variable(inp))
else:
for each in inputs:
if not isinstance(each, Variable):
raise type_error
return inputs
ret.append(self.to_variable(inputs))
return ret
def input(self, input_param_name='input'):
inputs = self.multiple_input(input_param_name)
......
......@@ -717,8 +717,9 @@ class While(object):
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in parent_block.vars:
out_vars.append(parent_block.var(inner_out_name))
inner_var = parent_block._find_var_recursive(inner_out_name)
if inner_var:
out_vars.append(inner_var)
step_scope = parent_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
......@@ -1264,10 +1265,11 @@ class ConditionalBlock(object):
if each_name not in input_set
]
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
out_list = []
for inner_out_name in intermediate:
inner_var = parent_block._find_var_recursive(inner_out_name)
if inner_var:
out_list.append(inner_var)
step_scope = parent_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
......
......@@ -41,6 +41,7 @@ __all__ = [
'crf_decoding',
'cos_sim',
'cross_entropy',
'bpr_loss',
'square_error_cost',
'chunk_eval',
'sequence_conv',
......@@ -1348,6 +1349,44 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
return out
def bpr_loss(input, label, name=None):
"""
Bayesian Personalized Ranking Loss Operator.
This operator belongs to pairwise ranking loss. Label is the desired item.
The loss at a given point in one session is defined as:
$Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$
Learn more details by reading paper <session-based recommendations with recurrent
neural networks>(https://arxiv.org/abs/1511.06939)
Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes.
This input is not probability but logits.
label (Variable|list): the ground truth which is a 2-D tensor. `label`
is a tensor<int64> with shape [N x 1].
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. Default: None.
Returns:
A 2-D tensor with shape [N x 1], the bpr loss.
Examples:
.. code-block:: python
cost = fluid.layers.bpr_loss(input=predict, label=label)
"""
helper = LayerHelper('bpr_loss', **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type='bpr_loss',
inputs={'X': [input],
'Label': [label]},
outputs={'Y': [out]})
return out
def square_error_cost(input, label):
"""
**Square error cost layer**
......@@ -6623,7 +6662,8 @@ def relu(x, name=None):
helper = LayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='x')
out = helper.create_variable_for_type_inference(dtype)
helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
helper.append_op(
type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
return out
......
......@@ -135,9 +135,17 @@ class ParallelExecutor(object):
build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
main = main_program
main = main if main else framework.default_main_program()
trainers_endpoints = main._trainers_endpoints
if num_trainers > 1 and trainers_endpoints:
assert num_trainers == len(
trainers_endpoints), "num_trainers == len(end_points)"
build_strategy.trainers_endpoints = trainers_endpoints
if scope == None:
scope = executor.global_scope()
......
......@@ -43,14 +43,13 @@ if(APPLE)
list(REMOVE_ITEM TEST_OPS test_desc_clone)
list(REMOVE_ITEM TEST_OPS test_program_code)
endif(NOT WITH_DISTRIBUTE)
message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
# this op is not support on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
# TODO: add the unitest back when it fixed
list(REMOVE_ITEM TEST_OPS test_detection_map_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
list(REMOVE_ITEM TEST_OPS test_gradient_clip)
endif()
if(NOT WITH_MKLML)
# this op is not support on openblas
......
......@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist:
if args.update_method == "pserver":
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(),
args.endpoints, args.trainers,
......@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
def get_data():
origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc:
if args.update_method == "pserver" and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest, randomize_probability
class TestBprLossOp1(OpTest):
"""Test BprLoss with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "bpr_loss"
batch_size = 40
class_num = 5
X = randomize_probability(batch_size, class_num, dtype='float64')
label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
bpr_loss_result = []
for i in range(batch_size):
sum = 0.0
for j in range(class_num):
if j == label[i][0]:
continue
sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
bpr_loss_result.append(-sum / (class_num - 1))
bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": bpr_loss}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
if __name__ == "__main__":
unittest.main()
......@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
self.activation = 'identity'
class TestIdentityActivation(TestConv2dFusionOp):
def init_activation(self):
self.activation = 'identity'
self.add_residual_data = False
class TestWithGroup(TestConv2dFusionOp):
def init_group(self):
self.groups = 3
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv3dOp):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNCase1(TestCase1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup1(TestWithGroup1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup2(TestWithGroup2):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
if __name__ == '__main__':
unittest.main()
......@@ -74,6 +74,8 @@ class TestConv3dOp(OpTest):
def setUp(self):
self.op_type = "conv3d"
self.use_cudnn = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
self.dtype = np.float32
self.init_kernel_type()
self.init_group()
......@@ -83,8 +85,7 @@ class TestConv3dOp(OpTest):
conv3d_param = {
'stride': self.stride,
'pad': self.pad,
'dilations': self.dilations,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
'dilations': self.dilations
}
input = np.random.random(self.input_size).astype(self.dtype)
......@@ -101,7 +102,9 @@ class TestConv3dOp(OpTest):
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
}
self.outputs = {'Output': output}
......@@ -109,59 +112,35 @@ class TestConv3dOp(OpTest):
return core.is_compiled_with_cuda() and self.use_cudnn
def test_check_output(self):
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
else:
self.check_output()
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_output_with_place(place, atol=1e-5)
def test_check_grad(self):
if self.dtype == np.float16:
return
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
set(['Input', 'Filter']),
'Output',
max_relative_error=0.03)
else:
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
def test_check_grad_no_filter(self):
if self.dtype == np.float16:
return
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Filter']))
else:
self.check_grad(
['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Filter']))
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
if self.dtype == np.float16:
return
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['Filter'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Input']))
else:
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Input']))
place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0, 0]
......
......@@ -76,12 +76,24 @@ class TestDistRunnerBase(object):
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist:
if args.update_method == "pserver":
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(),
args.endpoints, args.trainers,
args.sync_mode, args.dc_asgd)
trainer_prog = t.get_trainer_program()
elif args.update_method == "nccl2":
# transpile for nccl2
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
nccl2_t = fluid.DistributeTranspiler(config=config)
nccl2_t.transpile(
args.trainer_id,
program=fluid.default_main_program(),
startup_program=fluid.default_startup_program(),
trainers=args.endpoints,
current_endpoint=args.current_endpoint)
trainer_prog = fluid.default_main_program()
else:
trainer_prog = fluid.default_main_program()
......@@ -110,11 +122,20 @@ class TestDistRunnerBase(object):
len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
mypass.set_int("num_repeats", args.batch_merge_repeat)
if args.update_method == "nccl2":
num_trainers = len(args.endpoints.split(","))
trainer_id = args.trainer_id
else:
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor(
args.use_cuda,
loss_name=avg_cost.name,
exec_strategy=strategy,
build_strategy=build_stra)
build_strategy=build_stra,
num_trainers=num_trainers,
trainer_id=trainer_id)
feed_var_list = [
var for var in trainer_prog.global_block().vars.values()
......@@ -126,7 +147,7 @@ class TestDistRunnerBase(object):
def get_data():
origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc:
if args.update_method != "local" and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
......@@ -151,7 +172,11 @@ def runtime_main(test_class):
parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument('--is_dist', action='store_true')
parser.add_argument(
'--update_method',
type=str,
default="local",
choices=["pserver", "nccl2", "local"])
parser.add_argument('--trainer_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument(
......@@ -170,7 +195,7 @@ def runtime_main(test_class):
args = parser.parse_args()
model = test_class()
if args.role == "pserver" and args.is_dist:
if args.role == "pserver" and args.update_method == "pserver":
model.run_pserver(args)
else:
model.run_trainer(args)
......@@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase):
self._use_reduce = False
self._dc_asgd = False # must use with async mode
self._use_reader_alloc = True
self._nccl2_mode = False
self._setup_config()
self._after_setup_config()
......@@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase):
def start_pserver(self, model_file, check_error_log, required_envs):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
ps0_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers)
......@@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase):
else:
env_local = {'CPU_NUM': '1'}
envs.update(env_local)
env_local.update(envs)
print("local_cmd: {}, env: {}".format(cmd, env_local))
if check_error_log:
err_log = open("/tmp/trainer.err.log", "wb")
......@@ -278,13 +305,13 @@ class TestDistBase(unittest.TestCase):
cmd.split(" "),
stdout=subprocess.PIPE,
stderr=err_log,
env=envs)
env=env_local)
else:
local_proc = subprocess.Popen(
cmd.split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=envs)
env=env_local)
local_out, local_err = local_proc.communicate()
......@@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver"
tr0_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints,
0, ps0_ep, self._trainers)
......@@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase):
env0.update(envs)
env1.update(envs)
print("tr0_cmd:{}".format(tr0_cmd))
print("tr1_cmd:{}".format(tr1_cmd))
print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
......@@ -351,28 +378,111 @@ class TestDistBase(unittest.TestCase):
stderr=tr1_pipe,
env=env1)
# Wait until trainer process terminate
while True:
stat0 = tr0_proc.poll()
time.sleep(0.1)
if stat0 is not None:
break
while True:
stat1 = tr1_proc.poll()
time.sleep(0.1)
if stat1 is not None:
break
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
ps0_pipe.close()
ps1_pipe.close()
# FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL)
ps0.terminate()
ps1.terminate()
# print server log
with open("/tmp/ps0_err.log", "r") as fn:
sys.stderr.write("ps0 stderr: %s\n" % fn.read())
with open("/tmp/ps1_err.log", "r") as fn:
sys.stderr.write("ps1 stderr: %s\n" % fn.read())
# print log
if stat0 == 0:
sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
with open("/tmp/tr0_err.log", "r") as fn:
sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
if stat1 == 0:
sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
with open("/tmp/tr1_err.log", "r") as fn:
sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
return pickle.loads(tr0_out), pickle.loads(tr1_out)
def _run_cluster_nccl2(self, model, envs, check_error_log):
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints = self._ps_endpoints.split(",")
w0_ep, w1_ep = worker_endpoints
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2"
tr0_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints,
0, w0_ep)
tr1_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints,
1, w1_ep)
if self._mem_opt:
tr0_cmd += " --mem_opt"
tr1_cmd += " --mem_opt"
if self._use_reduce:
tr0_cmd += " --use_reduce"
tr1_cmd += " --use_reduce"
if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc"
if self.__use_cuda:
tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
else:
env0 = {'CPU_NUM': '1'}
env1 = {'CPU_NUM': '1'}
env0.update(envs)
env1.update(envs)
print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
# print log
sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
# return tr0_losses, tr1_losses
return pickle.loads(tr0_out), pickle.loads(tr1_out)
def check_with_place(self,
......@@ -386,21 +496,27 @@ class TestDistBase(unittest.TestCase):
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"FLAGS_cudnn_deterministic": "1",
"http_proxy": ""
"http_proxy": "",
"NCCL_P2P_DISABLE": "1"
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
local_losses\
= self._run_local(model_file, required_envs,
check_error_log)
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
check_error_log)
if self._nccl2_mode:
tr0_losses, tr1_losses = self._run_cluster_nccl2(
model_file, required_envs, check_error_log)
else:
tr0_losses, tr1_losses = self._run_cluster(
model_file, required_envs, check_error_log)
for step_id in range(RUN_STEP):
local_loss = local_losses[step_id]
......
......@@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase):
self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnistNCCL2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
self._use_reader_alloc = False
self._nccl2_mode = True
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", delta=1)
class TestDistMnist2x2Lars(TestDistBase):
def _setup_config(self):
self._sync_mode = True
......
......@@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase):
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp()
......
......@@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest):
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
config.wait_port = False
t = fluid.DistributeTranspiler(config=config)
t.transpile(
0,
......
......@@ -20,9 +20,6 @@ import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
BATCH_SIZE = 128
CLIP = 1
def bow_net(data,
label,
......@@ -64,6 +61,8 @@ class TestGradientClip(unittest.TestCase):
return places
def check_operators(self, place):
CLIP = 1
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
with fluid.program_guard(
......@@ -79,13 +78,13 @@ class TestGradientClip(unittest.TestCase):
avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone()
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
with fluid.program_guard(
main_program=prog_clip, startup_program=startup_program):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
......@@ -96,7 +95,7 @@ class TestGradientClip(unittest.TestCase):
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
batch_size=128)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
......@@ -112,12 +111,12 @@ class TestGradientClip(unittest.TestCase):
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out[1:]:
for v in out:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip[1:]:
for v in out_clip:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import sys
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
class MyLayer(fluid.imperative.PyLayer):
def __init__(self):
super(MyLayer, self).__init__()
def forward(self, inputs):
x = fluid.layers.relu(inputs[0])
self._x_for_debug = x
return [fluid.layers.elementwise_mul(x, x)]
class TestImperative(unittest.TestCase):
def test_layer(self):
with fluid.imperative.guard():
cl = core.Layer()
cl.forward([])
l = fluid.imperative.PyLayer()
l.forward([])
def test_layer_in_out(self):
with fluid.imperative.guard():
l = MyLayer()
x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
self.assertIsNotNone(x)
sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
x._backward()
sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
if __name__ == '__main__':
unittest.main()
......@@ -846,6 +846,15 @@ class TestBook(unittest.TestCase):
out = layers.cross_entropy(x, label, False, 4)
self.assertIsNotNone(out)
def test_bpr_loss(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[30, 10], dtype="float32")
label = layers.data(name="label", shape=[30, 1], dtype="int32")
out = layers.bpr_loss(x, label)
self.assertIsNotNone(out)
print(str(program))
def test_expand(self):
program = Program()
with program_guard(program):
......
......@@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard
from paddle.fluid.transpiler import memory_optimize
def _get_vars(prog):
assert (isinstance(prog, Program))
all_vars = set()
for op in prog.global_block().ops:
all_vars.update(op.input_arg_names)
all_vars.update(op.output_arg_names)
return all_vars
class TestControlFlowGraph(unittest.TestCase):
def setUp(self):
program = Program()
......@@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase):
self.program = program
def test_control_flow_graph(self):
print("before optimization")
print(str(self.program))
result_program = memory_optimize(self.program)
print("after optimization")
print(str(result_program))
result_program = self.program.clone()
memory_optimize(self.program)
old_vars = _get_vars(self.program)
new_vars = _get_vars(result_program)
self.assertTrue(old_vars != new_vars)
class TestMemoryTranspiler2(unittest.TestCase):
......@@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase):
avg_cost = layers.mean(cost)
opt = optimizer.SGD(learning_rate=0.001)
opt.minimize(avg_cost)
self.skip_set = set([cost.name, fc.name])
self.program = program
def test_inplace_ops(self):
print("before optimization")
print(str(self.program))
result_program = memory_optimize(self.program)
print("after optimization")
print(str(result_program))
result_program = self.program.clone()
memory_optimize(self.program)
old_vars = _get_vars(self.program)
new_vars = _get_vars(result_program)
self.assertTrue(old_vars != new_vars)
def test_skip_opt(self):
result_program = self.program.clone()
memory_optimize(self.program, skip_opt_set=self.skip_set)
old_vars = _get_vars(self.program)
new_vars = _get_vars(result_program)
self.assertTrue(old_vars != new_vars)
class TestMemoryTranspiler3(unittest.TestCase):
......
......@@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
# expected output selected rows
expected_out0_rows = [0, 4]
expected_out1_rows = [0, 2]
expected_out2_rows = []
expected_out4_rows = [0]
op = Operator(
......@@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
self.assertEqual(outs[0].rows(), expected_out0_rows)
self.assertEqual(outs[1].rows(), expected_out1_rows)
self.assertEqual(outs[2].rows(), expected_out2_rows)
self.assertEqual(outs[4].rows(), expected_out4_rows)
self.assertEqual(outs[0].height(), height_sections[0])
......@@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase):
self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
self.assertEqual(outs[2].numel(), 0)
self.assertEqual(outs[3].numel(), 0)
def check_grad_with_place(self, place):
scope = core.Scope()
height = 10
......
......@@ -142,6 +142,7 @@ class DistributeTranspilerConfig(object):
# supported modes: pserver, nccl2
mode = "pserver"
print_log = False
wait_port = True
class DistributeTranspiler(object):
......@@ -171,7 +172,6 @@ class DistributeTranspiler(object):
trainer_id = 0
trainers = 4
role = os.getenv("PADDLE_TRAINING_ROLE")
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id, pservers=pserver_endpoints, trainers=trainers)
......@@ -214,13 +214,16 @@ class DistributeTranspiler(object):
trainer_id,
trainers,
current_endpoint,
startup_program=None):
startup_program=None,
wait_port=True):
if not startup_program:
startup_program = default_startup_program()
if trainer_id >= 0:
worker_endpoints = trainers.split(",")
# send NCCL_ID to others or recv from trainer 0
worker_endpoints.remove(current_endpoint)
if trainer_id == 0 and wait_port:
wait_server_ready(worker_endpoints)
nccl_id_var = startup_program.global_block().create_var(
name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
......@@ -302,11 +305,13 @@ class DistributeTranspiler(object):
if self.config.mode == "nccl2":
assert (isinstance(trainers, str))
self.origin_program._trainers_endpoints = trainers.split(",")
self._transpile_nccl2(
trainer_id,
trainers,
current_endpoint,
startup_program=startup_program)
startup_program=startup_program,
wait_port=self.config.wait_port)
return
self.trainer_num = trainers
......@@ -652,9 +657,6 @@ class DistributeTranspiler(object):
# NOTE: assume blocks of the same variable is not distributed
# on the same pserver, only change param/grad varnames for
# trainers to fetch.
sys.stderr.write("get_pserver_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup \
in a single call.")
# step1
pserver_program = Program()
pserver_program.random_seed = self.origin_program.random_seed
......@@ -922,18 +924,6 @@ in a single call.")
Returns:
Program: parameter server side startup program.
"""
sys.stderr.write("get_startup_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup \
in a single call.")
if pserver_program != None:
sys.stderr.write("passing pserver_program to get_startup_program() \
is deprecated, you can use new API get_pserver_programs() to \
get both pserver main program and startup program.")
if startup_program != None:
sys.stderr.write("passing startup_program to get_startup_program() \
is deprecated, use fluid.program_guard() or pass this argument \
to transpile() call.")
s_prog = Program()
orig_s_prog = self.startup_program
s_prog.random_seed = orig_s_prog.random_seed
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import six
from collections import defaultdict, MutableSet
from .. import core
from ... import compat as cpt
......@@ -470,8 +471,21 @@ def memory_optimize(input_program,
Returns:
None
"""
def to_name_str(var):
if isinstance(var, Variable):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, six.string_types):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
if skip_opt_set is not None and not isinstance(skip_opt_set, set):
raise ValueError("only support skip_opt_set as set.")
global PRINT_LOG
PRINT_LOG = print_log
if skip_grads:
......@@ -486,6 +500,8 @@ def memory_optimize(input_program,
skip_opt_set = grad_set
else:
skip_opt_set.update(grad_set)
if skip_opt_set is not None:
skip_opt_set = set(map(to_name_str, skip_opt_set))
cfgs = _get_cfgs(input_program)
for cfg in cfgs:
cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
......
......@@ -101,6 +101,7 @@ packages=['paddle',
'paddle.dataset',
'paddle.reader',
'paddle.fluid',
'paddle.fluid.imperative',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
......
......@@ -27,6 +27,8 @@ import pydoc
member_dict = collections.OrderedDict()
experimental_namespace = {"paddle.fluid.imperative"}
def visit_member(parent_name, member):
cur_name = ".".join([parent_name, member.__name__])
......@@ -51,6 +53,8 @@ def visit_member(parent_name, member):
def visit_all_module(mod):
if (mod.__name__ in experimental_namespace):
return
for member_name in (
name
for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册