未验证 提交 0b290782 编写于 作者: K Kaipeng Deng 提交者: GitHub

Merge branch 'develop' into grid_sampler

......@@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
......
......@@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.0.1.post87
pip install paddlepaddle-gpu==1.1.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.0.1.post85
pip install paddlepaddle-gpu==1.1.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
......@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
......
......@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
......@@ -174,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
......
......@@ -56,6 +56,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
......
......@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
}
......@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
......@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_);
auto *dev_ctx = dev_ctxes_[p];
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
......
......@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}
......
......@@ -12,232 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
}
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_.reset(nullptr);
#endif
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
param_scopes_[input_scope_idx]->Var("input");
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
op_handle_.reset(
new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
#endif
}
std::unique_ptr<ir::Node> v =
ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
std::unique_ptr<ir::Node> v2 =
ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v2.get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
if (!use_gpu_) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
}
std::unique_ptr<ir::Node> v3 =
ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
std::unique_ptr<ir::Node> v4 =
ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v4.get()));
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddOutput(out_dummy_var_handle);
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
in_lod_tensor->Resize(kDims);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = static_cast<int>(kDims[0]) * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
};
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> place_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
}
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
place_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
place_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_.reset(nullptr);
#endif
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < place_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
param_scopes_[input_scope_idx]->Var("input");
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
place_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
place_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(
new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
#endif
}
std::unique_ptr<ir::Node> v =
ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
place_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
std::unique_ptr<ir::Node> v2 =
ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v2.get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < place_list_.size(); ++j) {
if (!use_gpu_) {
op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
}
std::unique_ptr<ir::Node> v3 =
ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
std::unique_ptr<ir::Node> v4 =
ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v4.get()));
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddOutput(out_dummy_var_handle);
}
std::vector<float> InitLoDTensor(const std::string& varname,
size_t input_scope_idx, const f::LoD& lod,
float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k + val_scalar;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), lod_tensor);
lod_tensor->set_lod(lod);
lod_tensor->Resize(kDims);
return send_vector;
}
std::vector<float> InitSelectedRows(const std::string& varname,
size_t input_scope_idx,
const std::vector<int64_t>& rows,
int height, float value_scalar = 0.0) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k + value_scalar;
}
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
selected_rows->set_height(height);
selected_rows->set_rows(rows);
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
return send_vector;
}
void SelectedRowsEqual(const std::string& varname, int input_scope_idx,
const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
}
p::CPUPlace cpu_place;
f::Tensor result_tensor;
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
void LoDTensorEqual(const std::string& varname,
const std::vector<float>& send_vec, const f::LoD& lod,
framework::Scope* scope) {
p::CPUPlace cpu_place;
auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t k = 0; k < f::product(kDims); ++k) {
ASSERT_NEAR(ct[k], send_vec[k], 1e-5);
}
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
f::LoD lod{{0, 10, 20}};
auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
op_handle_->Run(false);
WaitAll();
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual("out", send_vector, lod, param_scopes_[j]);
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2);
auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
op_handle_->Run(false);
WaitAll();
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height);
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait;
}
......
......@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p);
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
}
}
......@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
......
......@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
size_t num_complete = 0;
remaining_ = 0;
BlockingQueue<size_t> complete_q;
auto complete_q = std::make_shared<BlockingQueue<size_t>>();
for (auto op : bootstrap_ops_) {
RunOpAsync(op_deps.get(), op, &complete_q);
RunOpAsync(op_deps.get(), op, complete_q);
}
while (num_complete != op_deps->size()) {
size_t num_comp = complete_q.Pop();
size_t num_comp = complete_q->Pop();
if (num_comp == -1UL) {
int remaining = 0;
while (true) {
......@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
break;
}
for (int i = 0; i < remaining; ++i) {
complete_q.Pop();
complete_q->Pop();
}
}
exception_.ReThrow();
......@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
void FastThreadedSSAGraphExecutor::RunOpAsync(
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
++remaining_;
this->pool_.enqueue([=] {
OpHandleBase *op_to_run = op;
......@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
if (op_to_run == nullptr) {
op_to_run = pending_op;
} else {
this->RunOpAsync(op_deps, pending_op, complete_q);
RunOpAsync(op_deps, pending_op, complete_q);
}
}
}
......@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
}
void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
atomic_op_deps_ = pool_.enqueue([&] {
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
new std::unordered_map<OpHandleBase *, std::atomic<int>>;
auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
for (auto &pair : op_deps_) {
(*op_deps)[pair.first] = pair.second;
}
......
......@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::atomic<int> remaining_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q);
OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace paddle {
namespace framework {
namespace details {
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_;
void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
// initialize scope and var
for (size_t i = 0; i < place_list_.size(); ++i) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
local_scope.Var("out_var" + j);
if (i == j) local_scope.Var("in_var" + j);
}
param_scopes_.emplace_back(&local_scope);
}
// create op handle node
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new FusedBroadcastOpHandle(
n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not supported.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new FusedBroadcastOpHandle(
n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(
new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
#endif
}
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle
std::unique_ptr<ir::Node> in_node =
ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
VarHandle* in_var_handle =
new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) {
std::unique_ptr<ir::Node> out_node =
ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
}
}
void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
std::vector<std::vector<float>> send_vec;
f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
float val_scalar = static_cast<float>(i);
send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
}
op_handle_->Run(false);
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
}
}
}
void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
std::vector<std::vector<float>> send_vector;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
float val_scalar = static_cast<float>(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar));
}
op_handle_->Run(false);
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height);
}
}
}
};
TEST(FusedBroadcastTester, CPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
TEST(FusedBroadcastTester, CPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#ifdef PADDLE_WITH_CUDA
TEST(FusedBroadcastTester, GPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
TEST(FusedBroadcastTester, GPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
......@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor *out_tensor = out_value->mutable_value();
// copy
auto dev_ctx = dev_ctxes_[out_var_handle->place_];
auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
t_out_p] {
int s = 0, e = 0;
......
......@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) {
if (NeedWait(in)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
}
}
}
......
......@@ -27,7 +27,7 @@ namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (places_.size() == 1) return;
// the input and output may have dummy var.
......
......@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
}
}
}
......
......@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue;
}
if (in->GeneratedOp()) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
}
}
auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
......@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope),
place_(place) {
dev_ctxes_[place_] = dev_ctx;
this->SetDeviceContext(place_, dev_ctx);
}
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
......@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
} else {
#ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream();
auto stream = static_cast<platform::CUDADeviceContext *>(
this->dev_ctxes_.at(place_))
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
......
......@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
std::unordered_set<OpHandleBase *> ready_ops;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule
......@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
}
}
}
for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
InsertPendingVar(&pending_vars, &ready_vars, var.get());
InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
}
for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
......@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
FeedFetchList fetch_data(fetch_tensors.size());
InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
&pending_vars, &ready_vars, &fetch_data);
&pending_vars, ready_vars.get(), &fetch_data);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
running_ops_++;
RunOp(&ready_vars, op);
RunOp(ready_vars, op);
}
set.clear();
};
......@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
run_op_futures_.clear();
exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {
// 1. Run All Ready ops
......@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// 2. Find ready variable
bool timeout;
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
if (timeout) {
if (exception_holder_.IsCaught()) {
......@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
PADDLE_ENFORCE(ready_ops.empty());
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
......@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
}
void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
if (VLOG_IS_ON(10)) {
......
......@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~ThreadedSSAGraphExecutor() {}
private:
void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
details::OpHandleBase *op);
private:
......
......@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
return result;
}
bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
const std::pair<PDNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void GraphPatternDetector::UniquePatterns(
......@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetector::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
size_t key = 0;
for (auto &item : g) {
key ^= std::hash<void *>{}(item.first);
key ^= std::hash<void *>{}(item.second);
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
std::stringstream ss;
for (auto &item : sorted_keys) {
ss << item.first << ":" << item.second;
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
......
......@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
for (size_t j = 0; j < lod.size(); ++j) {
auto &sub_lod = new_lod[j];
auto &offset = sub_lod.back();
size_t offset = sub_lod.back();
for (size_t k = 1; k < lod[j].size(); ++k) {
sub_lod.push_back(lod[j][k] + offset);
}
......
......@@ -19,81 +19,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using LoDTensorArray = std::vector<LoDTensor>;
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class LoDTensorArray {
public:
LoDTensorArray() = default;
using iterator = std::vector<LoDTensor>::iterator;
using const_iterator = std::vector<LoDTensor>::const_iterator;
const_iterator begin() const { return array_.begin(); }
const_iterator end() const { return array_.begin() + size_; }
iterator begin() { return array_.begin(); }
iterator end() { return array_.begin() + size_; }
void push_back(const LoDTensor& x) {
if (size_ < array_.size()) {
array_[size_++] = x;
} else {
array_.push_back(x);
++size_;
}
}
void resize(size_t size) {
if (array_.size() < size) {
array_.resize(size);
}
size_ = size;
}
void emplace_back() { array_.emplace_back(); }
void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
LoDTensor& back() { return array_.back(); }
size_t space() const { return array_.size(); }
void reserve(size_t size) {
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if (size > 800UL) {
LOG(WARNING) << "TensorArray has more than 800 items";
}
array_.reserve(size);
}
bool empty() const { return size_ == 0UL; }
void clear() { size_ = 0UL; }
LoDTensor& operator[](size_t id) { return array_[id]; }
const LoDTensor& operator[](size_t id) const { return array_[id]; }
LoDTensor& at(size_t id) { return array_.at(id); }
const LoDTensor& at(size_t id) const { return array_.at(id); }
size_t size() const { return size_; }
private:
size_t size_{0};
std::vector<LoDTensor> array_;
};
#endif // !PADDLE_ON_INFERENCE
} // namespace framework
} // namespace paddle
......@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
}
}
static bool VarIsTensor(const Variable* var) {
return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
static bool VarIsTensor(const Variable& var) {
return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
}
const Tensor* GetTensorFromVar(Variable* var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
return var->GetMutable<SelectedRows>()->mutable_value();
const Tensor* GetTensorFromVar(const Variable& var) {
if (var.IsType<LoDTensor>()) {
return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
} else if (var.IsType<SelectedRows>()) {
return &(var.Get<SelectedRows>().value());
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
var.Type().name());
}
}
......@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
auto* var = InputVar(name);
return var == nullptr ? nullptr
: GetTensorFromVar(const_cast<Variable*>(var));
return var == nullptr ? nullptr : GetTensorFromVar(*var);
}
template <>
......@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : GetTensorFromVar(var);
return var == nullptr ? nullptr : GetTensorFromVar(*var);
});
return res;
}
......@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor =
GetTensorFromVar(transfer_scope.FindVar(var_name));
auto* var = transfer_scope.FindVar(var_name);
PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
var_name);
auto* transformed_tensor = GetTensorFromVar(*var);
original_tensor->ShareDataWith(*transformed_tensor);
}
}
......@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name);
// Only tensor can be tranfer to another device.
if (var == nullptr || !VarIsTensor(var)) {
if (var == nullptr || !VarIsTensor(*var)) {
continue;
}
auto* tensor_in = GetTensorFromVar(var);
auto* tensor_in = GetTensorFromVar(*var);
if (!tensor_in->IsInitialized()) {
continue;
}
......
......@@ -63,7 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
}
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
const Tensor* GetTensorFromVar(Variable* var);
const Tensor* GetTensorFromVar(const Variable& var);
class OperatorBase;
class ExecutionContext;
......
......@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
platform::CPUPlace());
EXPECT_EQ(p1, p2);
}
// Not sure if it's desired, but currently, Tensor type can be changed.
{
framework::Tensor src_tensor;
int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p1, nullptr);
*p1 = 1;
uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p2, nullptr);
EXPECT_EQ(static_cast<int>(p2[0]), 1);
}
#ifdef PADDLE_WITH_CUDA
{
......
if(WITH_TESTING)
include(test.cmake) # some generic cmake funtion for inference
endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis)
......
......@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
function (inference_analysis_test TARGET)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(mem_opt "")
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
function(inference_analysis_test TARGET)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
inference_base_test(${TARGET}
SRCS ${analysis_test_SRCS}
DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
endif()
endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
......
......@@ -17,39 +17,12 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif()
function(inference_api_test TARGET_NAME)
if (WITH_TESTING)
set(options "")
set(oneValueArgs SRC)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (WITH_GPU)
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
else()
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
endif()
if(inference_test_ARGS)
set_tests_properties(${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
......@@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification)
if(WITH_TESTING)
inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book)
......@@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
if(WITH_TESTING)
inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
......@@ -22,12 +22,14 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__
#define ACC_DIFF 4e-2
#define ACC_DIFF 4e-3
#else
#define ACC_DIFF 1e-2
#define ACC_DIFF 1e-3
#endif
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_string(word2vec_dirname, "",
"Directory of the word2vec inference model.");
DEFINE_string(book_dirname, "", "Directory of the book inference model.");
namespace paddle {
......@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() {
NativeConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.model_dir = FLAGS_word2vec_dirname;
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
......@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model";
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes =
......@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig();
config.use_gpu = use_gpu;
config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model";
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs);
......
......@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
//# 1. Create PaddlePredictor with a config.
NativeConfig config0;
config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
config0.model_dir = FLAGS_dirname;
config0.use_gpu = true;
config0.fraction_of_gpu_memory = 0.3;
config0.device = 0;
MixedRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
config1.model_dir = FLAGS_dirname;
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
config1.device = 0;
......
......@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j
word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
if [ -d $word2vec_model ]; then
for use_gpu in $use_gpu_list; do
./simple_on_word2vec \
......
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
function (inference_base_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS ARGS DEPS)
cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
endfunction()
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir})
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedSpatialTransformerDescriptor =
platform::ScopedSpatialTransformerDescriptor;
template <typename T>
class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output");
const T* theta_data = theta->data<T>();
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
T* output_data = output->mutable_data<T>(
{n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data));
}
};
template <typename T>
class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
const T* output_grad_data = output_grad->data<T>();
T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridOpKernel<float>,
paddle::operators::CUDNNAffineGridOpKernel<double>);
REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridGradOpKernel<float>,
paddle::operators::CUDNNAffineGridGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/affine_grid_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> {
framework::Tensor operator()(T start, T end, int count,
const framework::ExecutionContext& ctx) {
Tensor numbers;
T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
T slice = (end - start) / (T)(count - 1);
for (int i = 0; i < count; ++i) {
number_data[i] = start + (T)i * slice;
}
return numbers;
}
};
class AffineGridOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Theta"),
"Input(Theta) of AffineGridOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of AffineGridOp should not be null.");
auto theta_dims = ctx->GetInputDim("Theta");
PADDLE_ENFORCE(theta_dims.size() == 3,
"AffineGrid's Input(Theta) should be 3-D tensor.");
auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
if (output_shape.size() == 0) {
PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
"Input(OutputShape) of AffineGridOp should not be null if "
"attr(output_shape) is not configured.");
auto output_shape_dims = ctx->GetInputDim("OutputShape");
PADDLE_ENFORCE(output_shape_dims.size() == 1,
"AffineGrid's Input(OutputShape) should be 1-D tensor.");
} else {
PADDLE_ENFORCE(output_shape.size() == 4,
"The size of attr(output_shape) should be 4.");
}
PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");
// N * H * W * 2
ctx->SetOutputDim("Output",
framework::make_ddim({theta_dims[0], -1, -1, 2}));
ctx->ShareLoD("Theta", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN;
}
#endif
auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
return framework::OpKernelType(data_type, ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library);
}
};
class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Theta",
"(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
"It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
"y_1).");
AddInput("OutputShape",
"(Tensor) The shape of target image with format [N, C, H, W].")
.AsDispensable();
AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(true);
AddAttr<std::vector<int>>(
"output_shape",
"The target output image shape with format [N, C, H, W].")
.SetDefault(std::vector<int>());
AddComment(R"DOC(
It generates a grid of (x,y) coordinates using the parameters of the
affine transformation that correspond to a set of points where the input
feature map should be sampled to produce the transformed output feature map.
Given:
Theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
OutputShape = [2, 3, 5, 5]
Step 1:
Generate relative coordinates according to OutputShape.
The values of relative coordinates are in the interval between -1 and 1.
The shape of the relative coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
Step2:
Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
)DOC");
}
};
class AffineGridOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto theta_dims = ctx->GetInputDim("Theta");
if (ctx->HasOutput(framework::GradVarName("Theta"))) {
ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
}
};
class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("affine_grid_grad");
op->SetInput("Theta", Input("Theta"));
op->SetInput("OutputShape", Input("OutputShape"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
ops::AffineGridGradMaker);
REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
REGISTER_OP_CPU_KERNEL(
affine_grid,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
affine_grid_grad,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
using Array2 = Eigen::DSizes<int64_t, 2>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
/**
*Return a tensor with evenly spaced numbers over a specified interval.
*/
template <typename DeviceContext, typename T>
struct Linspace {
framework::Tensor operator()(T start, T end, int count,
const framework::ExecutionContext& ctx);
};
template <typename DeviceContext, typename T>
class AffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
Linspace<DeviceContext, T> linspace;
// Get indexes of height with shape [height, width, 1]
auto h_idx = linspace((T)-1, (T)1, h, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1]
auto w_idx = linspace((T)-1, (T)1, w, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1]
Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor grid;
grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
auto grid_t = EigenTensor<T, 4>::From(grid);
grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1))
.concatenate(h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1))
.shuffle(Array2(1, 0))
.reshape(Array3(h, w, 1)),
2)
.eval()
.concatenate(ones_t, 2)
.reshape(Array4(1, h, w, 3))
.broadcast(Array4(n, 1, 1, 1));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
T(0));
}
}
};
template <typename DeviceContext, typename T>
class AffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), theta_grad,
static_cast<T>(0));
Linspace<DeviceContext, T> linspace;
// Get indexes of height with shape [height, width, 1]
auto h_idx = linspace((T)-1, (T)1, h, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1]
auto w_idx = linspace((T)-1, (T)1, w, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1]
Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor grid;
grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
auto grid_t = EigenTensor<T, 4>::From(grid);
grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1))
.concatenate(h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1))
.shuffle(Array2(1, 0))
.reshape(Array3(h, w, 1)),
2)
.eval()
.concatenate(ones_t, 2)
.reshape(Array4(1, h, w, 3))
.broadcast(Array4(n, 1, 1, 1));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
&sliced_theta_grad, T(0));
}
}
};
} // namespace operators
} // namespace paddle
......@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
}
};
class DeleteVarOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {}
};
class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -48,4 +53,5 @@ It should not be configured by users directly.
REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::DeleteVarOpInfoMaker);
paddle::operators::DeleteVarOpInfoMaker,
paddle::operators::DeleteVarOpShapeInference);
......@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<int>, ops::GatherOpKernel<double>);
ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<double>);
ops::GatherGradientOpKernel<int64_t>);
......@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>);
......@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
template <typename T, bool is_test>
class MaxSeqPoolFunctor {
public:
void operator()(const platform::CPUDeviceContext& context,
......@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
}
}
};
// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
// index buffer
template <typename T>
class MaxSeqPoolFunctor<T, true> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
dim * sizeof(T));
for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
for (int64_t k = 0; k < dim; ++k) {
if (in_data[j * dim + k] > out_data[i * dim + k]) {
out_data[i * dim + k] = in_data[j * dim + k];
}
}
}
}
}
};
template <typename T>
class MaxSeqPoolGradFunctor {
public:
......@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
/* max pool has index output */
void operator()(const platform::CPUDeviceContext& context,
const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* output,
framework::Tensor* output, bool is_test,
framework::Tensor* index = nullptr) {
if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<T> max_pool;
max_pool(context, input, output, index);
if (is_test) {
math::MaxSeqPoolFunctor<T, true> max_pool;
max_pool(context, input, output, index);
} else {
math::MaxSeqPoolFunctor<T, false> max_pool;
max_pool(context, input, output, index);
}
return;
}
if (pooltype == "LAST") {
......@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool(context, input, output);
return;
}
if (pooltype == "FIRST") {
math::FirstSeqPoolFunctor<T> first_pool;
first_pool(context, input, output);
......
......@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const std::string pooltype, const framework::LoDTensor& input,
framework::Tensor* output,
framework::Tensor* output, bool is_test,
framework::Tensor* index = nullptr) {
auto& lod = input.lod()[0];
const size_t item_dim = output->numel() / output->dims()[0];
......
......@@ -28,7 +28,7 @@ class SequencePoolFunctor {
/* max pool has index output */
void operator()(const DeviceContext& context, const std::string pooltype,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index = nullptr);
bool is_test = false, framework::Tensor* index = nullptr);
};
template <typename DeviceContext, typename T>
......
......@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.")
.AsIntermediate();
AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<std::string>(
"pooltype",
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
......
......@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<Tensor>("Out");
std::string pooltype = context.Attr<std::string>("pooltype");
Tensor* index = nullptr;
if (pooltype == "MAX") {
index = context.Output<Tensor>("MaxIndex");
}
auto dims = in->dims();
auto lod = in->lod();
......@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
dims[0] = lod[0].size() - 1;
out->Resize({dims});
out->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") {
Tensor* index = nullptr;
const bool is_test = context.Attr<bool>("is_test");
// Do not create index buffer for inference (is_test) mode
// TODO(jczaja): Skip index buffer creation for other devices eg. GPU
if (pooltype == "MAX" &&
(is_test == false ||
platform::is_cpu_place(context.GetPlace()) == false)) {
index = context.Output<Tensor>("MaxIndex");
index->Resize({dims});
index->mutable_data<int>(context.GetPlace());
}
math::SequencePoolFunctor<DeviceContext, T> pool;
pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
index);
is_test, index);
}
};
......
......@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X");
auto x_vars_name = ctx.Inputs("X");
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
......@@ -81,10 +82,11 @@ class SumOp : public framework::OperatorWithKernel {
if (x_vars[0]->IsType<framework::LoDTensor>()) {
int dtype = -1;
for (auto& x_var : x_vars) {
for (size_t idx = 0; idx < x_vars.size(); ++idx) {
PADDLE_ENFORCE(x_vars[idx] != nullptr,
"Input var[%s] should not be nullptr", x_vars_name[idx]);
// FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
auto tensor = framework::GetTensorFromVar(
const_cast<framework::Variable*>(x_var));
auto tensor = framework::GetTensorFromVar(*x_vars[idx]);
if (tensor->numel() == 0) {
continue;
}
......
......@@ -147,7 +147,6 @@ function cmake_gen() {
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
......@@ -180,7 +179,6 @@ EOF
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
......
......@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
load_prog = Program()
load_block = load_prog.global_block()
need_delete_vars = []
for var_tuple in slice_vars_and_attrs:
orig_var = var_tuple[0]
start = var_tuple[1]
slice_var = var_tuple[2]
end = start + reduce(lambda x, y: x * y, slice_var.shape)
end = start + slice_var.shape[0]
clone_orig_var = load_block.create_var(
name=orig_var.name,
......@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
attrs={'axes': [0],
'starts': [start],
'ends': [end]})
need_delete_vars.append(clone_orig_var)
load_block.append_op(
type='delete_var',
inputs={'X': need_delete_vars}, )
executor.run(load_prog)
......@@ -154,6 +154,7 @@ __all__ = [
'mul',
'sigmoid_cross_entropy_with_logits',
'maxout',
'affine_grid',
'sequence_reverse',
'affine_channel',
'hash',
......@@ -711,8 +712,18 @@ def dynamic_gru(input,
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
bias_attr(ParamAttr): The parameter attribute for learnable the
hidden-hidden bias.
If it is set to None or one attribute of ParamAttr, dynamic_gru will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, dynamic_gru will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`.
gate_activation(str): The activation for update gate and reset gate.
......@@ -750,7 +761,7 @@ def dynamic_gru(input,
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0 != None:
if h_0:
assert h_0.shape == (
batch_size, size
), 'The shape of h0 should be(batch_size, %d)' % size
......@@ -811,10 +822,29 @@ def gru_unit(input,
Args:
input (Variable): The fc transformed input value of current step.
hidden (Variable): The hidden value of lstm unit from previous step.
hidden (Variable): The hidden value of gru unit from previous step.
size (integer): The input dimension value.
param_attr (ParamAttr): The weight parameters for gru unit. Default: None
bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(T \\times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
......@@ -1826,7 +1856,7 @@ def conv3d(input,
return helper.append_activation(pre_act)
def sequence_pool(input, pool_type):
def sequence_pool(input, pool_type, is_test=False):
"""
This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied
......@@ -1863,6 +1893,7 @@ def sequence_pool(input, pool_type):
input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max.
is_test(bool, Default False): Used distinguish training from scoring mode.
Returns:
The sequence pooling variable which is a Tensor.
......@@ -1890,7 +1921,8 @@ def sequence_pool(input, pool_type):
inputs={"X": input},
outputs={"Out": pool_out,
"MaxIndex": max_index},
attrs={"pooltype": pool_type.upper()})
attrs={"pooltype": pool_type.upper(),
"is_test": is_test})
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
......@@ -3019,7 +3051,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
pad_value = fluid.layers.assign(
input=numpy.array([0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
......@@ -4441,7 +4474,10 @@ def transpose(x, perm, name=None):
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
# use append_batch_size=False to avoid prepending extra
# batch size in shape
x = fluid.layers.data(name='x', shape=[5, 10, 15],
dtype='float32', append_batch_size=False)
x_transposed = layers.transpose(x, perm=[1, 0, 2])
"""
......@@ -6106,6 +6142,124 @@ def crop(x, shape=None, offsets=None, name=None):
return out
def affine_grid(theta, out_shape, name=None):
"""
It generates a grid of (x,y) coordinates using the parameters of
the affine transformation that correspond to a set of points where
the input feature map should be sampled to produce the transformed
output feature map.
.. code-block:: text
* Case 1:
Given:
theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
out_shape = [2, 3, 5, 5]
Step 1:
Generate normalized coordinates according to out_shape.
The values of the normalized coordinates are in the interval between -1 and 1.
The shape of the normalized coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
Step2:
Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
out_shape can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output with shape [N, H, W, 2].
Raises:
ValueError: If the type of arguments is not supported.
Examples:
.. code-block:: python
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape)
# or
data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
"""
helper = LayerHelper('affine_grid')
if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
isinstance(out_shape, Variable)):
raise ValueError("The out_shape should be a list, tuple or Variable.")
if not isinstance(theta, Variable):
raise ValueError("The theta should be a Variable.")
out = helper.create_variable_for_type_inference(theta.dtype)
ipts = {'Theta': theta}
attrs = {}
if isinstance(out_shape, Variable):
ipts['OutputShape'] = out_shape
else:
attrs['output_shape'] = out_shape
helper.append_op(
type='affine_grid',
inputs=ipts,
outputs={'Output': out},
attrs=None if len(attrs) == 0 else attrs)
return out
def rank_loss(label, left, right, name=None):
"""
**Rank loss layer for RankNet**
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
if(NOT APPLE)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_image_classification_vgg")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_image_classification_resnet")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
......@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
......@@ -89,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
endif()
......@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# TODO(typhoonzero): fix distributed adam optimizer
# opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
# Reader
train_reader = paddle.batch(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import signal
import subprocess
import argparse
import time
import math
import random
from multiprocessing import Process
from functools import reduce
import numpy as np
import unittest
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import io
from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
def _load_persistable_vars(self, executor, dirname, program):
def _is_checkpoint_var(var):
"""
the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
: param var(Variable)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
# @GRAD are named for gradient variables, checkpoint will not save it.
if "@GRAD" in var.name:
return False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if ".trainer_" in var.name:
return False
# .block is named for distribute train variables, checkpoint will not save it.
if ".block" in var.name:
return False
if "tmp_" in var.name:
return False
return var.persistable
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def run_pserver(self, args):
self.get_model(batch_size=2)
# NOTE: pserver should not call memory optimize
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), args.endpoints,
args.trainers, args.sync_mode)
pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog)
need_load = bool(int(os.getenv("LOAD", "0")))
model_dir = os.getenv("MODEL_DIR", "")
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
if need_load and model_dir:
self._load_persistable_vars(exe, model_dir, startup_prog)
exe.run(pserver_prog)
def run_trainer(self, args):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2)
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist:
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(),
args.endpoints, args.trainers,
args.sync_mode)
trainer_prog = t.get_trainer_program()
else:
trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy()
if args.use_reduce:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
else:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor(
args.use_cuda,
loss_name=avg_cost.name,
exec_strategy=strategy,
build_strategy=build_stra)
feed_var_list = [
var for var in trainer_prog.global_block().vars.values()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = train_reader()
def get_data():
origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
new_batch.append(item)
return new_batch
else:
return origin_batch
need_save = bool(int(os.getenv("SAVE", "0")))
model_dir = os.getenv("MODEL_DIR", "")
if need_save:
for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()))
if need_save and model_dir:
io.save_persistables(startup_exe, model_dir, trainer_prog)
var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
print(np.ravel(var).tolist())
if __name__ == "__main__":
paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
runtime_main(TestDistSaveLoad2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def AffineGrid(theta, size):
n = size[0]
w = size[3]
h = size[2]
h_idx = np.repeat(
np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
w_idx = np.repeat(
np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
grid = np.concatenate(
[w_idx, h_idx, np.ones([h, w, 1])], axis=2) # h * w * 3
grid = np.repeat(grid[np.newaxis, :], size[0], axis=0) # n * h * w *3
ret = np.zeros([n, h * w, 2])
theta = theta.transpose([0, 2, 1])
for i in range(len(theta)):
ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
# print ret.reshape([h * w, 2]).astype("float32")
return ret.reshape([n, h, w, 2]).astype("float32")
class TestAffineGridOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "affine_grid"
theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
theta = np.ones(self.theta_shape).astype("float32")
self.inputs = {'Theta': theta}
self.attrs = {"use_cudnn": True}
if self.dynamic_shape:
self.inputs['OutputShape'] = self.output_shape
else:
self.attrs['output_shape'] = self.output_shape
self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['Theta'],
'Output',
no_grad_set=['OutputShape'],
max_relative_error=0.006)
def initTestCase(self):
self.theta_shape = (3, 2, 3)
self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
self.dynamic_shape = False
class TestAffineGridOpCase1(TestAffineGridOp):
def initTestCase(self):
self.theta_shape = (3, 2, 3)
self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
self.dynamic_shape = True
if __name__ == '__main__':
unittest.main()
......@@ -22,6 +22,8 @@ import signal
import subprocess
import six
import argparse
import pickle
import numpy as np
import paddle.fluid as fluid
......@@ -128,10 +130,15 @@ class TestDistRunnerBase(object):
else:
return origin_batch
out_losses = []
for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()))
print(loss)
out_losses.append(loss[0])
if six.PY2:
print(pickle.dumps(out_losses))
else:
sys.stdout.buffer.write(pickle.dumps(out_losses))
def runtime_main(test_class):
......@@ -149,7 +156,7 @@ def runtime_main(test_class):
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true')
parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True)
'--use_reader_alloc', action='store_true', required=False)
parser.add_argument('--batch_size', required=False, type=int, default=2)
parser.add_argument(
'--batch_merge_repeat', required=False, type=int, default=1)
......@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase):
self._pservers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = "python"
self._python_interp = sys.executable
self._sync_mode = True
self._enforce_place = None
self._mem_opt = False
......@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase):
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid):
retry_times = 50
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error as e:
sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
(e, retry_times))
retry_times -= 1
def _run_local(self,
model,
envs,
......@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase):
env=envs)
local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(local_out)
if check_error_log:
err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret)
sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n")
return local_losses
return pickle.loads(local_out)
def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
......@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase):
env0.update(envs)
env1.update(envs)
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
print("tr0_cmd:{}".format(tr0_cmd))
print("tr1_cmd:{}".format(tr1_cmd))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
......@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase):
env=env1)
tr0_out, tr0_err = tr0_proc.communicate()
tr0_loss_text = cpt.to_text(tr0_out)
tr1_out, tr1_err = tr1_proc.communicate()
tr1_loss_text = cpt.to_text(tr1_out)
# close trainer file
tr0_pipe.close()
......@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase):
ps1.terminate()
# print log
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n")
tr1_losses = tr1_loss_text.split("\n")
return tr0_losses, tr1_losses
# return tr0_losses, tr1_losses
return pickle.loads(tr0_out), pickle.loads(tr1_out)
def check_with_place(self,
model_file,
......@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase):
check_error_log)
for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0]
tr0_loss = eval(tr0_losses[step_id])[0]
tr1_loss = eval(tr1_losses[step_id])[0]
dist_loss = (tr0_loss + tr1_loss) / 2
print(str(local_loss) + ":" + str(dist_loss))
self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
local_loss = local_losses[step_id]
tr0_loss = tr0_losses[step_id]
tr1_loss = tr1_losses[step_id]
dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
print("=======", local_loss, ":", dist_loss[0], "=======")
self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import shutil
import unittest
import tempfile
import numpy as np
from test_dist_base import TestDistBase, RUN_STEP
class TestDistSaveLoadDense2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._enforce_place = "CPU"
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"http_proxy": ""
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp()
local_env = {}
local_env["SAVE"] = "1"
local_env["MODEL_DIR"] = model_dir
local_env.update(required_envs)
cluster_env = {}
cluster_env["LOAD"] = "1"
cluster_env["MODEL_DIR"] = model_dir
cluster_env.update(required_envs)
local_var = self._run_local(model_file, local_env, check_error_log)
tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
check_error_log)
shutil.rmtree(model_dir)
local_np = np.array(eval(local_var[0]))
train0_np = np.array(eval(tr0_var[0]))
train1_np = np.array(eval(tr1_var[0]))
self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
def test_dist(self):
need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place(
"dist_save_load.py",
delta=0,
check_error_log=False,
need_envs=need_envs)
if __name__ == "__main__":
unittest.main()
......@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
self._use_reader_alloc = False
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100)
self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistseResnXt2x2WithMemopt(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._mem_opt = True
self._use_reader_alloc = False
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100)
self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistSeResneXt2x2Async(TestDistBase):
......
......@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
trainer, _ = self.get_trainer()
class TestFtrl(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
opt = fluid.optimizer.Ftrl(learning_rate=0.1)
opt.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer, _ = self.get_trainer()
class TestLRDecayConditional(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
......@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
["sum", "scale", "scale", "elementwise_add", "momentum"])
class TestEmptyPserverOptimizeBlocks(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
# only one parameter
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=False)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
sgd_optimizer.minimize(avg_cost)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
self.assertEqual(len(pserver.blocks), 2)
self.assertEqual(len(pserver.blocks[1].ops), 0)
class TestDistLookupTableBase(TranspilerTest):
def network_with_table(self, is_sparse, is_distributed):
self.table_size = 1000
self.emb_size = 64
self.lookup_table_name = 'shared_w'
def emb_pool(ids):
def emb_pool(ids, table_name, is_distributed):
emb = fluid.layers.embedding(
input=ids,
size=[self.table_size, self.emb_size],
dtype='float32',
param_attr=self.lookup_table_name, # share parameter
param_attr=table_name,
is_sparse=is_sparse,
is_distributed=is_distributed)
pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
......@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
name='title_ids', shape=[1], dtype='int64', lod_level=1)
brand_ids = fluid.layers.data(
name='brand_ids', shape=[1], dtype='int64', lod_level=1)
title_emb = emb_pool(title_ids)
brand_emb = emb_pool(brand_ids)
fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
profile_ids = fluid.layers.data(
name='brand_ids', shape=[1], dtype='int64', lod_level=1)
title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
profile_emb = emb_pool(profile_ids, "profile_emb", False)
fc0 = fluid.layers.concat(
input=[title_emb, brand_emb, profile_emb], axis=1)
predict = fluid.layers.fc(input=fc0,
size=2,
act=None,
......@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 3)
self.assertEqual(len(pserver1.blocks), 4)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
......@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "scale", "adam", "scale", "scale"])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["sum", "scale", "adam", "scale", "scale"])
trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
'fill_constant', 'mean_grad', 'cross_entropy_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
......@@ -480,39 +535,45 @@ class TestDistLookupTable(TestDistLookupTableBase):
def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 5)
self.assertEqual(len(pserver1.blocks), 6)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table sgd
# 4 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["sum", "sgd"])
# 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"])
# 4 save table
self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"])
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer, trainer_startup = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids',
'send', 'send_barrier', 'recv', 'recv', 'fetch_barrier'
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv',
'fetch_barrier', 'fake_init'
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
]
self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
startup_ops)
......@@ -526,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 3)
self.assertEqual(len(pserver1.blocks), 4)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
......@@ -535,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"])
# 3 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["adam", "scale", "scale"])
trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
'fill_constant', 'mean_grad', 'cross_entropy_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
'recv', 'recv', 'concat'
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv',
'recv', 'concat', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
......@@ -559,29 +626,34 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 5)
self.assertEqual(len(pserver1.blocks), 6)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
# 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
# 2 optimize for table adam
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"])
# 3 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
# 4 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"])
# 4 save table
self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"])
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids',
'send', 'recv', 'recv'
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv',
'recv', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
......
......@@ -873,6 +873,22 @@ class TestBook(unittest.TestCase):
out = layers.grid_sampler(x, grid)
self.assertIsNotNone(out)
print(str(program))
def test_affine_grid(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
out, ids = layers.argsort(input=data, axis=1)
theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
out_shape = layers.data(
name="out_shape", shape=[-1], dtype="float32")
data_0 = layers.affine_grid(theta, out_shape)
data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
self.assertIsNotNone(data_0)
self.assertIsNotNone(data_1)
print(str(program))
if __name__ == '__main__':
......
......@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
exe.run(pserver_prog)
def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
trainer_id):
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
# loss function
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
# optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
ps1 = ip + ":" + str(int(port) + 1)
ps2 = ip + ":" + port
pserver_endpoints = ps1 + "," + ps2
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
t = fluid.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=sync_mode)
pserver_prog = t.get_pserver_program(ps2)
# pserver2 have no parameter
assert (len(pserver_prog.blocks) == 2)
assert (len(pserver_prog.blocks[1].ops) == 0)
pserver_startup = t.get_startup_program(ps2, pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
class TestListenAndServOp(OpTest):
def setUp(self):
self.ps_timeout = 5
......@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
self.trainers = 1
self.trainer_id = 0
def _start_pserver(self, use_cuda, sync_mode):
def _start_pserver(self, use_cuda, sync_mode, pserver_func):
p = Process(
target=run_pserver,
target=pserver_func,
args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
self.trainer_id))
p.daemon = True
......@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
def test_handle_signal_in_serv_op(self):
# run pserver on CPU in sync mode
p1 = self._start_pserver(False, True)
p1 = self._start_pserver(False, True, run_pserver)
self._wait_ps_ready(p1.pid)
# raise SIGTERM to pserver
os.kill(p1.pid, signal.SIGINT)
p1.join()
# run pserver on CPU in async mode
p2 = self._start_pserver(False, False, run_pserver)
self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver
os.kill(p2.pid, signal.SIGTERM)
p2.join()
def test_list_and_serv_run_empty_optimize_block(self):
# run pserver on CPU in sync mode
p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
self._wait_ps_ready(p1.pid)
# raise SIGTERM to pserver
......@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
p1.join()
# run pserver on CPU in async mode
p2 = self._start_pserver(False, False)
p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver
......
......@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
def compute(self, x, offset, out):
self.attrs = {'pooltype': "MAX", 'is_test': True}
for i in range(len(offset[0]) - 1):
sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
(-1, 3 * 11))
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
def test_check_grad(self):
"""Grad computation does not apply to Sequence MAX
Pool executed when is_test is true """
return
class TestSeqLastPool2D(TestSeqAvgPool2D):
def compute(self, x, offset, out):
self.attrs = {'pooltype': "LAST"}
......
......@@ -35,6 +35,7 @@ import sys
import numpy as np
import collections
import six
import logging
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework
......@@ -767,6 +768,15 @@ in a single call.")
prefetch_var_name_to_block_id.extend(
lookup_table_var_name_to_block_id)
if len(optimize_blocks) == 0:
logging.warn("pserver [" + str(endpoint) +
"] has no optimize block!!")
pre_block_idx = pserver_program.num_blocks - 1
empty_block = pserver_program._create_block(pre_block_idx)
optimize_blocks.append(empty_block)
# In some case, some parameter server will have no parameter to optimize
# So we give an empty optimize block to parameter server.
attrs = {
"optimize_blocks": optimize_blocks,
"endpoint": endpoint,
......@@ -910,11 +920,11 @@ to transpile() call.")
block_idx = int(block_name.split(block_suffix)[1])
orig_var = self.origin_program.global_block().vars[orig_var_name]
skip_numel = 0
skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name]
for slice_var in slice_vars[:block_idx]:
skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
slice_vars_and_attrs.append([orig_var, skip_numel, param])
skip_dim0 += slice_var.shape[0]
slice_vars_and_attrs.append([orig_var, skip_dim0, param])
return slice_vars_and_attrs
......@@ -1065,7 +1075,12 @@ to transpile() call.")
continue_search_lookup_table_op = False
all_ops = program.global_block().ops
for op in all_ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
"W")[0]:
if not op.attr('is_distributed'):
raise RuntimeError(
"lookup_table_op that lookup an distributed embedding table"
"should set is_distributed to true")
continue_search_lookup_table_op = True
lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
......@@ -1275,7 +1290,6 @@ to transpile() call.")
}
outputs = {"ParamOut": [param_var]}
# only support sgd now
import logging
logging.warn(
"distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+ table_opt_op.type)
......@@ -1442,6 +1456,9 @@ to transpile() call.")
elif op_type == "decayed_adagrad":
if varkey == "Moment":
return param_shape
elif op_type == "ftrl":
if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
return param_shape
elif op_type == "sgd":
pass
else:
......
......@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
raise TypeError("scope should be as Scope type or None")
use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
if use_mkldnn:
self._depthwise_conv_mkldnn(program)
self._fuse_batch_norm(program, place, scope)
if use_mkldnn:
self._fuse_conv_bias_mkldnn(program)
......@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program)
def _depthwise_conv_mkldnn(self, program):
'''
Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
The result is:
- before:
- any_other_op->depthwise_conv->any_other_op
- after:
- any_other_op->conv->any_other_op
:param program: program to transpile
:type program: Program
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
if current_op.type == 'depthwise_conv2d':
current_op.desc.set_type("conv2d")
i = i + 1
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def _fuse_conv_eltwise_mkldnn(self, program):
'''
Transpile the program fusing elementwise_add into conv for MKLDNN
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册