提交 745aacfc 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_dist_unit_test

# A image for building paddle binaries # A image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment # Use cuda devel base image for both cpu and gpu environment
FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR ARG UBUNTU_MIRROR
...@@ -57,7 +57,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 ...@@ -57,7 +57,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed. # version(1.7.1 for now), which causes building documentation failed.
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip==9.0.3 && \
pip install -U wheel && \ pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark pip install sphinx-rtd-theme==0.1.9 recommonmark
......
...@@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING) ...@@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING)
/usr/lib/reference/ /usr/lib/reference/
) )
else() else()
# Diable the finding of reference cblas under host's system path # Disable the finding of reference cblas under host's system path
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif() endif()
......
...@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad ...@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
## Turing Completeness ## Turing Completeness
......
cc_library(var_handle SRCS var_handle.cc DEPS place) cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
...@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ...@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
Tensor *GetTensorFromVar(Variable *in_var) {
if (in_var->IsType<LoDTensor>()) {
return in_var->GetMutable<LoDTensor>();
} else if (in_var->IsType<SelectedRows>()) {
return in_var->GetMutable<SelectedRows>()->mutable_value();
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
return nullptr;
}
BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void BroadcastOpHandle::RunImpl() {
// the input may have dummy var.
std::vector<VarHandle *> in_var_handle;
for (auto *in : inputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(in);
if (out_handle) {
in_var_handle.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
"The number of input should be one.");
// the output may have dummy var.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// Wait input done, this Wait is asynchronous operation
auto &in_place = in_var_handle[0]->place_;
if (in_var_handle[0]->generated_op_) {
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
}
}
//
auto in_scope_idx = in_var_handle[0]->scope_idx_;
auto in_var =
local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
Tensor *in_tensor = GetTensorFromVar(in_var);
for (auto *out : out_var_handles) {
auto &out_p = out->place_;
auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
"Places must be all on CPU or all on CUDA.");
if (in_var->IsType<framework::SelectedRows>()) {
auto &in_sr = in_var->Get<framework::SelectedRows>();
auto out_sr = out_var->GetMutable<framework::SelectedRows>();
if (&in_sr == out_sr) continue;
out_sr->set_height(in_sr.height());
out_sr->set_rows(in_sr.rows());
out_sr->mutable_value()->Resize(in_sr.value().dims());
out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
} else if (in_var->IsType<framework::LoDTensor>()) {
auto in_lod = in_var->Get<framework::LoDTensor>();
auto out_lod = out_var->GetMutable<framework::LoDTensor>();
if (&in_lod == out_lod) continue;
out_lod->set_lod(in_lod.lod());
out_lod->Resize(in_lod.dims());
out_lod->mutable_data(out_p, in_lod.type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
}
Tensor *out_tensor = GetTensorFromVar(out_var);
paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
out_tensor);
}
}
std::string BroadcastOpHandle::Name() const { return "broadcast"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct BroadcastOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[input_scope_idx];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = input_scope_idx;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[j];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = j;
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddOutput(out_dummy_var_handle);
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = static_cast<int>(kDims[0]) * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
};
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastLodTensor(input_scope_idx);
}
TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitBroadcastOp(input_scope_idx);
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() { ...@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() {
} }
} }
this->RunAndRecordEvent([this] {
op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_); op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
});
} }
std::string ComputationOpHandle::Name() const { return op_->Type(); } std::string ComputationOpHandle::Name() const { return op_->Type(); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
void GatherOpHandle::RunImpl() {
// the input may have dummy var.
std::vector<VarHandle *> in_var_handles;
for (auto *in : inputs_) {
auto *in_handle = dynamic_cast<VarHandle *>(in);
if (in_handle) {
in_var_handles.push_back(in_handle);
}
}
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
// the output may have dummy var.
std::vector<VarHandle *> out_var_handles;
for (auto *out : outputs_) {
auto *out_handle = dynamic_cast<VarHandle *>(out);
if (out_handle) {
out_var_handles.push_back(out_handle);
}
}
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
// Wait input done, this Wait is asynchronous operation
for (auto *in : in_var_handles) {
if (in->generated_op_) {
in->generated_op_->Wait(dev_ctxes_[in->place_]);
}
}
std::vector<int64_t> out_rows;
std::vector<Tensor> in_tensors;
std::vector<platform::Place> in_places;
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
// gather the inputs
for (auto *in : in_var_handles) {
auto in_handle = static_cast<VarHandle *>(in);
auto in_p = in_handle->place_;
in_places.push_back(in_p);
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
"The type of input is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
"The height of inputs is not consistent.");
PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ,
"The dims of inputs is not consistent.");
auto in_sr_rows = in_sr.rows();
out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
in_tensors.emplace_back(in_sr.value());
}
// write the output
auto &out_place = out_var_handles[0]->place_;
auto out_scope_idx = out_var_handles[0]->scope_idx_;
auto out_var =
local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
auto out = out_var->GetMutable<framework::SelectedRows>();
out->set_height(pre_in.height());
out->set_rows(out_rows);
size_t rows = out_rows.size();
DDim out_dim = pre_in.GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows);
out->mutable_value()->Resize(out_dim);
out->mutable_value()->mutable_data(out_place, pre_in.value().type());
Tensor *out_tensor = out->mutable_value();
// copy
int s = 0, e = 0;
for (size_t j = 0; j < in_tensors.size(); ++j) {
e += in_tensors[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors[j], out_place,
*(dev_ctxes_[in_places[j]]), &sub_out);
s = e;
}
}
std::string GatherOpHandle::Name() const { return "gather"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct GatherOpHandle : public OpHandleBase {
const std::vector<Scope *> &local_scopes_;
const std::vector<platform::Place> &places_;
GatherOpHandle(const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestGatherOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
}
void InitCtxOnGpu(bool use_gpu) {
if (use_gpu) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
}
}
void InitGatherOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
}
local_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
vars_.emplace_back(new VarHandle());
VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
in_var_handle->place_ = gpu_list_[j];
in_var_handle->name_ = "input";
in_var_handle->version_ = 1;
in_var_handle->scope_idx_ = j;
in_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* in_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
in_dummy_var_handle->generated_op_ = nullptr;
op_handle_->AddInput(in_dummy_var_handle);
// add output
vars_.emplace_back(new VarHandle());
VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
out_var_handle->place_ = gpu_list_[input_scope_idx];
out_var_handle->name_ = "out";
out_var_handle->version_ = 2;
out_var_handle->scope_idx_ = input_scope_idx;
op_handle_->AddOutput(out_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
op_handle_->AddOutput(dummy_var_handle);
}
void TestGatherSelectedRows(size_t output_scope_idx) {
int height = kDims[0] * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
std::vector<float> send_vector(f::product(kDims));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
in_selected_rows->value());
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
};
TEST(GatherTester, TestCPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(GatherTester, TestGPUGatherTestSelectedRows) {
TestGatherOpHandle test_op;
size_t input_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitGatherOp(input_scope_idx);
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include <algorithm>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( ...@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
} }
} }
struct ReduceLoDTensor {
const std::vector<LoDTensor> &src_tensors_;
LoDTensor &dst_tensor_;
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = src_tensors_[i];
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
}
};
void NCCLAllReduceOpHandle::RunImpl() { void NCCLAllReduceOpHandle::RunImpl() {
if (inputs_.size() == 1) { if (inputs_.size() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
...@@ -41,14 +69,20 @@ void NCCLAllReduceOpHandle::RunImpl() { ...@@ -41,14 +69,20 @@ void NCCLAllReduceOpHandle::RunImpl() {
int dtype = -1; int dtype = -1;
size_t numel = 0; size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls; std::vector<LoDTensor> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i]; auto *s = local_scopes_[i];
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>(); auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
if (platform::is_gpu_place(lod_tensors[0].place())) {
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>()); void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) { if (dtype == -1) {
...@@ -59,20 +93,43 @@ void NCCLAllReduceOpHandle::RunImpl() { ...@@ -59,20 +93,43 @@ void NCCLAllReduceOpHandle::RunImpl() {
numel = static_cast<size_t>(lod_tensor.numel()); numel = static_cast<size_t>(lod_tensor.numel());
} }
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_.at(dev_id); auto &nccl_ctx = nccl_ctxs_.at(dev_id);
auto stream = nccl_ctx.stream(); auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_; auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] { all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum, buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
comm, stream)); ncclSum, comm, stream));
}); });
} }
this->RunAndRecordEvent([&] {
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) { for (auto &call : all_reduce_calls) {
call(); call();
} }
});
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
} }
} }
......
...@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) { ...@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) {
#endif #endif
RunImpl(); RunImpl();
#ifdef PADDLE_WITH_CUDA
if (use_event) {
for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
auto stream =
static_cast<platform::CUDADeviceContext *>(p.second)->stream();
PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
}
}
#endif
} }
void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
...@@ -97,6 +86,43 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { ...@@ -97,6 +86,43 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
out->generated_op_ = this; out->generated_op_ = this;
} }
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (!events_.empty()) { // Use event
std::function<void()> method = callback;
for (auto &p : dev_ctxes_) {
method = [method, p, this]() {
static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
events_.at(boost::get<platform::CUDAPlace>(p.first).device),
method);
};
}
method();
} else {
#endif
callback();
#ifdef PADDLE_WITH_CUDA
}
#endif
}
void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_cpu_place(p) || events_.empty()) {
callback();
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
callback);
}
#else
callback();
#endif
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -62,6 +62,11 @@ class OpHandleBase { ...@@ -62,6 +62,11 @@ class OpHandleBase {
virtual bool IsMultiDeviceTransfer() { return false; } virtual bool IsMultiDeviceTransfer() { return false; }
protected: protected:
void RunAndRecordEvent(const std::function<void()> &callback);
void RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback);
virtual void RunImpl() = 0; virtual void RunImpl() = 0;
}; };
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include <string>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() { ...@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() {
*tmp = coeff_; *tmp = coeff_;
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] {
auto stream = auto stream =
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_]) static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
});
#endif #endif
} }
} }
......
...@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() { ...@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() {
} }
in->generated_op_->Wait(dev_ctxes_[p]); in->generated_op_->Wait(dev_ctxes_[p]);
} }
op_->Run(*local_scope_, place_); this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
} }
std::string SendOpHandle::Name() const { return "send"; } std::string SendOpHandle::Name() const { return "send"; }
......
...@@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) { BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] { auto op_run = [ready_var_q, op, this] {
try { try {
VLOG(10) << op->Name() << " : " << op->DebugString(); VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
op->Run(use_event_); op->Run(use_event_);
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--; running_ops_--;
ready_var_q->Extend(op->outputs_); ready_var_q->Extend(op->outputs_);
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EnforceNotMet ex) { } catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex)); exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) { } catch (...) {
......
...@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase { ...@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
// version field currently is not used, however, just store the version to // version field currently is not used, however, just store the version to
// debug easily. // debug easily.
size_t version_; size_t version_;
size_t scope_idx_;
std::string name_; std::string name_;
platform::Place place_; platform::Place place_;
}; };
......
...@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name,
if (tensor.memory_size() == 0) { if (tensor.memory_size() == 0) {
return; return;
} }
if (tensor.type().hash_code() != typeid(float).hash_code() && if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT
tensor.type().hash_code() != typeid(double).hash_code()) { tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT
return; return;
} }
PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
...@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
// Return true if the block has feed operators and holder of matching info. // Return true if the block has feed operators and holder of matching info.
static bool has_feed_operators( static bool has_feed_operators(
const BlockDesc& block, const BlockDesc& block,
std::map<std::string, const LoDTensor*>& feed_targets, const std::map<std::string, const LoDTensor*>& feed_targets,
const std::string& feed_holder_name) { const std::string& feed_holder_name) {
size_t feed_count = 0; size_t feed_count = 0;
for (auto* op : block.AllOps()) { for (auto* op : block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == kFeedOpType) {
feed_count++; feed_count++;
// The input variable's name of feed_op should be feed_holder_name.
PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
"Input to feed op should be '%s'", feed_holder_name); "Input to feed op should be '%s'", feed_holder_name);
std::string feed_target_name = op->Output("Out")[0]; std::string feed_target_name = op->Output("Out")[0];
...@@ -166,7 +167,8 @@ static bool has_feed_operators( ...@@ -166,7 +167,8 @@ static bool has_feed_operators(
feed_count, feed_targets.size(), feed_count, feed_targets.size(),
"The number of feed operators should match 'feed_targets'"); "The number of feed operators should match 'feed_targets'");
// When feed operator are present, so should be feed_holder if (!feed_holder_name.empty()) {
// When feed operator are present, so should be feed_holder.
auto var = block.FindVar(feed_holder_name); auto var = block.FindVar(feed_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
feed_holder_name); feed_holder_name);
...@@ -174,6 +176,7 @@ static bool has_feed_operators( ...@@ -174,6 +176,7 @@ static bool has_feed_operators(
"'%s' variable should be 'FEED_MINIBATCH' type", "'%s' variable should be 'FEED_MINIBATCH' type",
feed_holder_name); feed_holder_name);
} }
}
return feed_count > 0; return feed_count > 0;
} }
...@@ -185,12 +188,14 @@ static bool has_feed_operators( ...@@ -185,12 +188,14 @@ static bool has_feed_operators(
// and fetch_holder_name. Raise exception when any mismatch is found. // and fetch_holder_name. Raise exception when any mismatch is found.
// Return true if the block has fetch operators and holder of matching info. // Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators( static bool has_fetch_operators(
const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets, const BlockDesc& block,
const std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& fetch_holder_name) { const std::string& fetch_holder_name) {
size_t fetch_count = 0; size_t fetch_count = 0;
for (auto* op : block.AllOps()) { for (auto* op : block.AllOps()) {
if (op->Type() == kFetchOpType) { if (op->Type() == kFetchOpType) {
fetch_count++; fetch_count++;
// The output variable's name of fetch_op should be fetch_holder_name.
PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
"Output of fetch op should be '%s'", fetch_holder_name); "Output of fetch op should be '%s'", fetch_holder_name);
std::string fetch_target_name = op->Input("X")[0]; std::string fetch_target_name = op->Input("X")[0];
...@@ -206,7 +211,8 @@ static bool has_fetch_operators( ...@@ -206,7 +211,8 @@ static bool has_fetch_operators(
fetch_count, fetch_targets.size(), fetch_count, fetch_targets.size(),
"The number of fetch operators should match 'fetch_targets'"); "The number of fetch operators should match 'fetch_targets'");
// When fetch operator are present, so should be fetch_holder if (!fetch_holder_name.empty()) {
// When fetch operator are present, so should be fetch_holder.
auto var = block.FindVar(fetch_holder_name); auto var = block.FindVar(fetch_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
fetch_holder_name); fetch_holder_name);
...@@ -214,6 +220,7 @@ static bool has_fetch_operators( ...@@ -214,6 +220,7 @@ static bool has_fetch_operators(
"'%s' variable should be 'FETCH_LIST' type", "'%s' variable should be 'FETCH_LIST' type",
fetch_holder_name); fetch_holder_name);
} }
}
return fetch_count > 0; return fetch_count > 0;
} }
...@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
} }
} }
// map the data of feed_targets to feed_holder
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
if (!has_fetch_ops) { if (!has_fetch_ops) {
// create fetch_holder variable // create fetch_holder variable
auto* fetch_holder = global_block->Var(fetch_holder_name); auto* fetch_holder = global_block->Var(fetch_holder_name);
...@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
} }
} }
Run(*copy_program, scope, 0, create_vars, create_vars); auto ctx = Prepare(*copy_program, 0);
RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
// obtain the data of fetch_targets from fetch_holder feed_holder_name, fetch_holder_name);
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
} }
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
...@@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
const std::string& feed_holder_name, const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);
PADDLE_ENFORCE(
has_feed_operators(global_block, feed_targets, feed_holder_name),
"Program in ExecutorPrepareContext should has feed_ops.");
PADDLE_ENFORCE(
has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
"Program in the prepared context should has fetch_ops.");
// map the data of feed_targets to feed_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
RunPreparedContext(ctx, scope, create_vars, create_vars);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,9 @@ limitations under the License. */ ...@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -70,6 +73,13 @@ class Executor { ...@@ -70,6 +73,13 @@ class Executor {
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
bool create_vars = true,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
private: private:
const platform::Place place_; const platform::Place place_;
}; };
......
...@@ -11,8 +11,10 @@ ...@@ -11,8 +11,10 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace(); auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
......
...@@ -14,8 +14,12 @@ ...@@ -14,8 +14,12 @@
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
DEFINE_int32(io_threadpool_size, 100,
"number of threads used for doing IO, default 100");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() { ...@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
} }
} }
std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
std::once_flag ThreadPoolIO::io_init_flag_;
ThreadPool* ThreadPoolIO::GetInstanceIO() {
std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
return io_threadpool_.get();
}
void ThreadPoolIO::InitIO() {
if (io_threadpool_.get() == nullptr) {
// TODO(typhoonzero1986): make this configurable
io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -14,12 +14,12 @@ limitations under the License. */ ...@@ -14,12 +14,12 @@ limitations under the License. */
#pragma once #pragma once
#include <condition_variable> #include <condition_variable> // NOLINT
#include <functional> #include <functional>
#include <future> #include <future> // NOLINT
#include <mutex> #include <mutex> // NOLINT
#include <queue> #include <queue>
#include <thread> #include <thread> // NOLINT
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -28,6 +28,22 @@ limitations under the License. */ ...@@ -28,6 +28,22 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
: future_(std::move(f)) {}
void operator()() const {
auto ex = this->future_.get();
if (ex != nullptr) {
LOG(FATAL) << "The exception is thrown inside the thread pool. You "
"should use RunAndGetException to handle the exception.\n"
"The default exception handler is LOG(FATAL)."
<< ex->what();
}
}
};
// ThreadPool maintains a queue of tasks, and runs them using a fixed // ThreadPool maintains a queue of tasks, and runs them using a fixed
// number of threads. // number of threads.
class ThreadPool { class ThreadPool {
...@@ -87,22 +103,6 @@ class ThreadPool { ...@@ -87,22 +103,6 @@ class ThreadPool {
void Wait(); void Wait();
private: private:
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
: future_(std::move(f)) {}
void operator()() const {
auto ex = this->future_.get();
if (ex != nullptr) {
LOG(FATAL) << "The exception is thrown inside the thread pool. You "
"should use RunAndGetException to handle the exception.\n"
"The default exception handler is LOG(FATAL)."
<< ex->what();
}
}
};
DISABLE_COPY_AND_ASSIGN(ThreadPool); DISABLE_COPY_AND_ASSIGN(ThreadPool);
// If the task queue is empty and avaialbe is equal to the number of // If the task queue is empty and avaialbe is equal to the number of
...@@ -135,6 +135,17 @@ class ThreadPool { ...@@ -135,6 +135,17 @@ class ThreadPool {
std::condition_variable completed_; std::condition_variable completed_;
}; };
class ThreadPoolIO : ThreadPool {
public:
static ThreadPool* GetInstanceIO();
static void InitIO();
private:
// NOTE: threadpool in base will be inhereted here.
static std::unique_ptr<ThreadPool> io_threadpool_;
static std::once_flag io_init_flag_;
};
// Run a function asynchronously. // Run a function asynchronously.
// NOTE: The function must return void. If the function need to return a value, // NOTE: The function must return void. If the function need to return a value,
// you can use lambda to capture a value pointer. // you can use lambda to capture a value pointer.
...@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) { ...@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
return ThreadPool::GetInstance()->Run(callback); return ThreadPool::GetInstance()->Run(callback);
} }
template <typename Callback>
std::future<void> AsyncIO(Callback callback) {
return ThreadPoolIO::GetInstanceIO()->Run(callback);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
// Temporarilly add this function for exposing framework::InitDevices() when // Temporarily add this function for exposing framework::InitDevices() when
// linking the inference shared library. // linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); } void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
......
...@@ -46,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds, TestInference<paddle::platform::CPUPlace, false, true>(
cpu_fetchs1, FLAGS_repeat); dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -57,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds, TestInference<paddle::platform::CUDAPlace, false, true>(
cpu_fetchs2, FLAGS_repeat); dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -89,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -89,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
template <typename Place, bool CreateVars = true> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
...@@ -175,8 +175,15 @@ void TestInference(const std::string& dirname, ...@@ -175,8 +175,15 @@ void TestInference(const std::string& dirname,
} }
// Ignore the profiling results of the first run // Ignore the profiling results of the first run
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
if (PrepareContext) {
ctx = executor.Prepare(*inference_program, 0);
executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
CreateVars);
} else {
executor.Run(*inference_program, scope, feed_targets, fetch_targets, executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars); CreateVars);
}
// Enable the profiler // Enable the profiler
paddle::platform::EnableProfiler(state); paddle::platform::EnableProfiler(state);
...@@ -187,9 +194,16 @@ void TestInference(const std::string& dirname, ...@@ -187,9 +194,16 @@ void TestInference(const std::string& dirname,
"run_inference", "run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
if (PrepareContext) {
// Note: if you change the inference_program, you need to call
// executor.Prepare() again to get a new ExecutorPrepareContext.
executor.RunPreparedContext(ctx.get(), scope, feed_targets,
fetch_targets, CreateVars);
} else {
executor.Run(*inference_program, scope, feed_targets, fetch_targets, executor.Run(*inference_program, scope, feed_targets, fetch_targets,
CreateVars); CreateVars);
} }
}
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler( paddle::platform::DisableProfiler(
......
...@@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency) ...@@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency)
op_library(channel_recv_op DEPS concurrency) op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
# Because the fully connected layer has only one MKLDNN's operator
if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN)
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
op_library(${src}) op_library(${src})
endforeach() endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
add_subdirectory(reader) add_subdirectory(reader)
......
...@@ -25,12 +25,14 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>( ...@@ -25,12 +25,14 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), old_num_accumulates_, auto cuda_place =
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), boost::get<platform::CUDAPlace>(in_old_num_accumulates->place());
sizeof(int64_t), stream); memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
stream);
memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place,
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream); in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_updates_, cuda_place,
in_num_updates->data<int64_t>(), sizeof(int64_t), stream); in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
} }
...@@ -42,14 +44,16 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>( ...@@ -42,14 +44,16 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates"); auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates"); auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates"); auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
auto cuda_place =
boost::get<platform::CUDAPlace>(out_old_num_accumulates->place());
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(), memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
stream); stream);
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(), memory::Copy(cuda_place, out_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
stream); stream);
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(), memory::Copy(cuda_place, out_num_updates->data<int64_t>(),
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream); platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
} }
......
...@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
const auto *bias = ctx.Input<Tensor>("Bias"); const auto *bias = ctx.Input<Tensor>("Bias");
auto *y = ctx.Output<Tensor>("Y"); auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
// alloc memory // alloc memory
y->mutable_data<T>(ctx.GetPlace()); y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
...@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
// Run training mode. // Run training mode.
// obtain running mean and running inv var, and see if we need to // obtain running mean and running inv var, and see if we need to
// initialize them. // initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
double this_factor = 1. - momentum; double this_factor = 1. - momentum;
CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
......
...@@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
this] {
auto* var = p_scope->FindVar(var_name_val); auto* var = p_scope->FindVar(var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
...@@ -89,7 +90,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -89,7 +90,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
this] {
// prepare input // prepare input
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
...@@ -132,7 +134,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, ...@@ -132,7 +134,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, ch, this] { time_out, ch, this] {
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
...@@ -196,7 +198,7 @@ bool RPCClient::Wait() { ...@@ -196,7 +198,7 @@ bool RPCClient::Wait() {
std::vector<std::future<void>> waits(req_count_); std::vector<std::future<void>> waits(req_count_);
for (int i = 0; i < req_count_; i++) { for (int i = 0; i < req_count_; i++) {
waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
} }
for (int i = 0; i < req_count_; i++) { for (int i = 0; i < req_count_; i++) {
......
...@@ -217,10 +217,10 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -217,10 +217,10 @@ void AsyncGRPCServer::RunSyncUpdate() {
std::function<void()> prefetch_register = std::function<void()> prefetch_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
// TODO(wuyi): Run these "HandleRequest" in thread pool
t_send_.reset( t_send_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_send_.get(), "cq_send", send_register))); cq_send_.get(), "cq_send", send_register)));
t_get_.reset( t_get_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_get_.get(), "cq_get", get_register))); cq_get_.get(), "cq_get", get_register)));
......
...@@ -268,6 +268,7 @@ void batched_gemm<platform::CUDADeviceContext, float16>( ...@@ -268,6 +268,7 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta, const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) { float16* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from // Note that cublas follows fortran order, so the order is different from
// the cblas convention. // the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M; int lda = (transA == CblasNoTrans) ? K : M;
...@@ -288,9 +289,13 @@ void batched_gemm<platform::CUDADeviceContext, float16>( ...@@ -288,9 +289,13 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53"); "cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
#endif
} }
template <> template <>
...@@ -299,6 +304,7 @@ void batched_gemm<platform::CUDADeviceContext, float>( ...@@ -299,6 +304,7 @@ void batched_gemm<platform::CUDADeviceContext, float>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta, const float alpha, const float* A, const float* B, const float beta,
float* C, const int batchCount, const int strideA, const int strideB) { float* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from // Note that cublas follows fortran order, so the order is different from
// the cblas convention. // the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M; int lda = (transA == CblasNoTrans) ? K : M;
...@@ -313,6 +319,9 @@ void batched_gemm<platform::CUDADeviceContext, float>( ...@@ -313,6 +319,9 @@ void batched_gemm<platform::CUDADeviceContext, float>(
PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
#endif
} }
template <> template <>
...@@ -321,6 +330,7 @@ void batched_gemm<platform::CUDADeviceContext, double>( ...@@ -321,6 +330,7 @@ void batched_gemm<platform::CUDADeviceContext, double>(
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta, const double alpha, const double* A, const double* B, const double beta,
double* C, const int batchCount, const int strideA, const int strideB) { double* C, const int batchCount, const int strideA, const int strideB) {
#if CUDA_VERSION >= 8000
// Note that cublas follows fortran order, so the order is different from // Note that cublas follows fortran order, so the order is different from
// the cblas convention. // the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M; int lda = (transA == CblasNoTrans) ? K : M;
...@@ -335,6 +345,9 @@ void batched_gemm<platform::CUDADeviceContext, double>( ...@@ -335,6 +345,9 @@ void batched_gemm<platform::CUDADeviceContext, double>(
PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
#else
PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
#endif
} }
template <> template <>
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory = auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); mkldnn::memory({src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto dst_memory = auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); mkldnn::memory({dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory); *workspace_memory);
...@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
pool_bwd_desc, mkldnn_engine, *pool_pd); pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory = auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); mkldnn::memory({diff_src_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(in_x_grad_data)));
auto diff_dst_memory = auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); mkldnn::memory({diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(out_grad_data)));
auto bwd_prim = mkldnn::pooling_backward( auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h" #include "paddle/fluid/operators/prelu_op.h"
#include <string> #include <string>
namespace paddle { namespace paddle {
......
...@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { ...@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
bool flip = ctx->Attrs().Get<bool>("flip"); bool flip = ctx->Attrs().Get<bool>("flip");
std::vector<float> aspect_ratios_vec; std::vector<float> aspect_ratios_vec;
ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
......
...@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip"); auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios; std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w")); T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h")); T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
...@@ -22,23 +24,23 @@ namespace operators { ...@@ -22,23 +24,23 @@ namespace operators {
inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
bool flip, bool flip,
std::vector<float>& output_aspect_ratior) { std::vector<float>* output_aspect_ratior) {
constexpr float epsilon = 1e-6; constexpr float epsilon = 1e-6;
output_aspect_ratior.clear(); output_aspect_ratior->clear();
output_aspect_ratior.push_back(1.0f); output_aspect_ratior->push_back(1.0f);
for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
float ar = input_aspect_ratior[i]; float ar = input_aspect_ratior[i];
bool already_exist = false; bool already_exist = false;
for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
if (fabs(ar - output_aspect_ratior[j]) < epsilon) { if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
already_exist = true; already_exist = true;
break; break;
} }
} }
if (!already_exist) { if (!already_exist) {
output_aspect_ratior.push_back(ar); output_aspect_ratior->push_back(ar);
if (flip) { if (flip) {
output_aspect_ratior.push_back(1.0f / ar); output_aspect_ratior->push_back(1.0f / ar);
} }
} }
} }
...@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
auto clip = ctx.Attr<bool>("clip"); auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios; std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w")); T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h")); T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/rank_loss_op.h" #include "paddle/fluid/operators/rank_loss_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -33,28 +33,14 @@ static constexpr size_t kChannelSize = 0; // kCacheSize - 2 ...@@ -33,28 +33,14 @@ static constexpr size_t kChannelSize = 0; // kCacheSize - 2
class DoubleBufferReader : public framework::DecoratedReader { class DoubleBufferReader : public framework::DecoratedReader {
public: public:
struct Item {
Item() : ctx_(nullptr) {}
Item(Item&& b) {
payloads_ = std::move(b.payloads_);
ctx_ = std::move(b.ctx_);
}
Item& operator=(Item&& b) {
payloads_ = std::move(b.payloads_);
ctx_ = std::move(b.ctx_);
return *this;
}
std::vector<framework::LoDTensor> payloads_;
platform::DeviceContext* ctx_;
};
explicit DoubleBufferReader( explicit DoubleBufferReader(
ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
: DecoratedReader(reader), place_(target_place) { : DecoratedReader(reader), place_(target_place) {
cpu_tensor_cache_.resize(kCacheSize);
gpu_tensor_cache_.resize(kCacheSize);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (size_t i = 0; i < kCacheSize; ++i) {
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
for (size_t i = 0; i < kCacheSize; ++i) {
ctxs_.emplace_back(new platform::CUDADeviceContext( ctxs_.emplace_back(new platform::CUDADeviceContext(
boost::get<platform::CUDAPlace>(place_))); boost::get<platform::CUDAPlace>(place_)));
} }
...@@ -72,7 +58,7 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -72,7 +58,7 @@ class DoubleBufferReader : public framework::DecoratedReader {
bool HasNext() const; bool HasNext() const;
void StartPrefetcher() { void StartPrefetcher() {
channel_ = framework::MakeChannel<Item>(kChannelSize); channel_ = framework::MakeChannel<size_t>(kChannelSize);
prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
} }
...@@ -88,8 +74,10 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -88,8 +74,10 @@ class DoubleBufferReader : public framework::DecoratedReader {
void PrefetchThreadFunc(); void PrefetchThreadFunc();
std::thread prefetcher_; std::thread prefetcher_;
framework::Channel<Item>* channel_; framework::Channel<size_t>* channel_;
platform::Place place_; platform::Place place_;
std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_; std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
}; };
...@@ -153,11 +141,14 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -153,11 +141,14 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear(); out->clear();
if (HasNext()) { if (HasNext()) {
Item batch; size_t cached_tensor_id;
channel_->Receive(&batch); channel_->Receive(&cached_tensor_id);
*out = batch.payloads_; if (platform::is_gpu_place(place_)) {
if (batch.ctx_) { *out = gpu_tensor_cache_[cached_tensor_id];
batch.ctx_->Wait(); ctxs_[cached_tensor_id]->Wait();
} else {
// CPU place
*out = cpu_tensor_cache_[cached_tensor_id];
} }
} }
} }
...@@ -176,42 +167,33 @@ bool DoubleBufferReader::HasNext() const { ...@@ -176,42 +167,33 @@ bool DoubleBufferReader::HasNext() const {
void DoubleBufferReader::PrefetchThreadFunc() { void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts."; VLOG(5) << "A new prefetch thread starts.";
std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
size_t cached_tensor_id = 0; size_t cached_tensor_id = 0;
while (true) { while (true) {
Item batch; auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
reader_->ReadNext(&cpu_batch); reader_->ReadNext(&cpu_batch);
if (cpu_batch.empty()) { if (cpu_batch.empty()) {
// The underlying reader have no next data. // The underlying reader have no next data.
break; break;
} }
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
auto* gpu_ctx = ctxs_[cached_tensor_id].get(); auto* gpu_ctx = ctxs_[cached_tensor_id].get();
gpu_batch.resize(cpu_batch.size()); gpu_batch.resize(cpu_batch.size());
for (size_t i = 0; i < cpu_batch.size(); ++i) { for (size_t i = 0; i < cpu_batch.size(); ++i) {
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
gpu_batch[i].set_lod(cpu_batch[i].lod()); gpu_batch[i].set_lod(cpu_batch[i].lod());
} }
batch.payloads_ = gpu_batch;
batch.ctx_ = gpu_ctx;
} else {
// CPUPlace
batch.payloads_ = cpu_batch;
} }
++cached_tensor_id;
cached_tensor_id %= kCacheSize;
try { try {
channel_->Send(&batch); size_t tmp = cached_tensor_id;
channel_->Send(&tmp);
} catch (paddle::platform::EnforceNotMet e) { } catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread will terminate."; "prefetch thread will terminate.";
break; break;
} }
++cached_tensor_id;
cached_tensor_id %= kCacheSize;
} }
channel_->Close(); channel_->Close();
VLOG(5) << "Prefetch thread terminates."; VLOG(5) << "Prefetch thread terminates.";
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <future> // NOLINT
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -19,7 +20,6 @@ limitations under the License. */ ...@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle { namespace paddle {
......
...@@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel { ...@@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
static framework::DDim ValidateShape(const std::vector<int> shape, static framework::DDim ValidateShape(const std::vector<int> shape,
const framework::DDim &in_dims) { const framework::DDim &in_dims) {
const int64_t in_size = framework::product(in_dims); const int64_t in_size = framework::product(in_dims);
// only one dimension canbe set to -1, whose size will be automatically // only one dimension can be set to -1, whose size will be automatically
// infered. // infered.
const int64_t unk_dim_val = -1; const int64_t unk_dim_val = -1;
const int64_t copy_dim_val = 0; const int64_t copy_dim_val = 0;
...@@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel<T> { ...@@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape"); auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
framework::DDim out_dims = out->dims(); framework::DDim out_dims = out->dims();
if (shape_tensor) { if (shape_tensor) {
auto *shape_data = shape_tensor->data<int>(); auto *shape_data = shape_tensor->data<int>();
if (platform::is_gpu_place(ctx.GetPlace())) {
framework::Tensor cpu_shape_tensor; framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
&cpu_shape_tensor); &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
ctx.device_context().Wait();
} }
auto shape = auto shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel()); std::vector<int>(shape_data, shape_data + shape_tensor->numel());
...@@ -145,6 +147,7 @@ class ReshapeKernel : public framework::OpKernel<T> { ...@@ -145,6 +147,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
if (!inplace) { if (!inplace) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
ctx.device_context().Wait();
// TensorCopy will resize to in_dims. // TensorCopy will resize to in_dims.
out->Resize(out_dims); out->Resize(out_dims);
} else { } else {
...@@ -167,6 +170,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> { ...@@ -167,6 +170,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
auto in_dims = d_x->dims(); auto in_dims = d_x->dims();
if (!inplace) { if (!inplace) {
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
ctx.device_context().Wait();
d_x->Resize(in_dims); d_x->Resize(in_dims);
} else { } else {
d_x->ShareDataWith(*d_out); d_x->ShareDataWith(*d_out);
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <limits>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scale_op.h" #include "paddle/fluid/operators/scale_op.h"
#include <string> #include <string>
namespace paddle { namespace paddle {
......
...@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "gather.cu.h" #include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/operators/gather_op.h"
#include "scatter.cu.h" #include "paddle/fluid/operators/scatter.cu.h"
#include "paddle/fluid/operators/scatter_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "gather.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "scatter.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/scatter.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and ...@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scatter.h" #include "paddle/fluid/operators/scatter.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
TEST(scatter, ScatterUpdate) { TEST(scatter, ScatterUpdate) {
using namespace paddle::framework; // using namespace paddle::framework;
using namespace paddle::platform; // using namespace paddle::platform;
using namespace paddle::operators; // using namespace paddle::operators;
Tensor* src = new Tensor(); paddle::framework::Tensor* src = new paddle::framework::Tensor();
Tensor* index = new Tensor(); paddle::framework::Tensor* index = new paddle::framework::Tensor();
Tensor* output = new Tensor(); paddle::framework::Tensor* output = new paddle::framework::Tensor();
float* p_src = nullptr; float* p_src = nullptr;
int* p_index = nullptr; int* p_index = nullptr;
p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace()); p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}),
p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace()); paddle::platform::CPUPlace());
p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
paddle::platform::CPUPlace());
for (size_t i = 0; i < 4; ++i) p_src[i] = float(i); for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
p_index[0] = 1; p_index[0] = 1;
float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace()); float* p_output = output->mutable_data<float>(
paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
auto* cpu_place = new paddle::platform::CPUPlace(); auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::platform::CPUDeviceContext ctx(*cpu_place);
ScatterAssign<float>(ctx, *src, *index, output); paddle::operators::ScatterAssign<float>(ctx, *src, *index, output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0)); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4)); for (size_t i = 4; i < 8; ++i) {
EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
}
for (size_t i = 4; i < 8; ++i) for (size_t i = 4; i < 8; ++i)
EXPECT_EQ(output->data<float>()[i], float(i - 4)); EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4));
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0)); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0)); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
delete src; delete src;
delete index; delete index;
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <future> // NOLINT
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -19,7 +20,6 @@ limitations under the License. */ ...@@ -19,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle { namespace paddle {
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <future> #include <future> // NOLINT
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
......
...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/context_project.h" #include "paddle/fluid/operators/math/context_project.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_erase_op.h" #include "paddle/fluid/operators/sequence_erase_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
......
...@@ -84,13 +84,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ...@@ -84,13 +84,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
} }
} }
out_dims[0] = out_first_dim; out_dims[0] = out_first_dim;
ctx->SetOutputDim("Out", out_dims);
} else { } else {
out_dims[0] = -1; out_dims[0] = -1;
}
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}
}; };
class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -12,8 +12,135 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,135 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #include <algorithm>
#include "paddle/fluid/operators/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
const size_t* ref_lod,
const size_t* offset,
const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length, T* out_data) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = x_lod[bid + 1] - x_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = static_cast<int>(offset[bid]);
int x_offset = x_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
}
}
}
}
template <typename T>
__global__ void sequence_expand_grad_kernel(
const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
const size_t* offset, const size_t lod_size,
/* default=1,
the instance length*/
const int x_item_length, T* dx_data) {
int bid = blockIdx.x;
if (bid >= lod_size - 1) return;
int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
int repeats = ref_lod[bid + 1] - ref_lod[bid];
int out_offset = static_cast<int>(offset[bid]);
int x_offset = dx_lod[bid];
for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
for (int tid_x = threadIdx.x; tid_x < x_item_length;
tid_x += blockDim.x) {
platform::CudaAtomicAdd(
&dx_data[(x_offset + tid_y) * x_item_length + tid_x],
dout_data[(out_offset + tid_z * x_item_count + tid_y) *
x_item_length +
tid_x]);
}
}
}
}
void GetOutputOffset(const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
framework::Vector<size_t>* out_offset) {
size_t offset = 0;
int lod_size = static_cast<int>(x_lod.size());
for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
(*out_offset)[i] = offset;
if (i < lod_size - 1) {
offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
}
}
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
const platform::CUDADeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
}
};
template <typename T>
struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand based lod*/
LoDTensor* dx) {
int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
x_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
dx->mutable_data<T>(context.GetPlace()));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <numeric> // std::iota
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
...@@ -26,6 +27,57 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -26,6 +27,57 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
struct SequenceExpandFunctor {
void operator()(
const DeviceContext& ctx, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out);
};
template <typename DeviceContext, typename T>
struct SequenceExpandGradFunctor {
void operator()(
const DeviceContext& ctx, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx);
};
template <typename T>
struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& x,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int out_offset = 0;
auto& eigen_place = *context.eigen_device();
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
auto x_sub_tensor = x.Slice(x_start, x_end);
x_sub_tensor.Resize({1, x_sub_tensor.numel()});
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
auto out_sub_tensor =
out->Slice(out_start, out_start + x_seq_len * repeat_num);
out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
EigenMatrix<T>::From(x_sub_tensor)
.broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
}
out_offset += repeat_num;
}
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SequenceExpandKernel : public framework::OpKernel<T> { class SequenceExpandKernel : public framework::OpKernel<T> {
public: public:
...@@ -47,45 +99,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> { ...@@ -47,45 +99,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
return; return;
} }
auto& out_lod = *out->mutable_lod(); // x lod level is at most 1.
framework::Vector<size_t> out_lod;
if (x_lod.size() == 1) { if (x_lod.size() == 1) {
out_lod.resize(1); out_lod.push_back(0);
out_lod[0] = {0};
}
int out_offset = 0; int out_offset = 0;
auto& eigen_place =
*context.template device_context<DeviceContext>().eigen_device();
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
int x_start = i - 1; int x_start = x_lod[0][i - 1];
int x_end = i; int x_end = x_lod[0][i];
if (x_lod.size() == 1) {
x_start = x_lod[0][i - 1];
x_end = x_lod[0][i];
}
int x_seq_len = x_end - x_start; int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
auto x_sub_tensor = x->Slice(x_start, x_end);
x_sub_tensor.Resize({1, x_sub_tensor.numel()});
int out_start = out_offset;
if (x_lod.size() == 1) {
out_start = out_lod[0][out_offset];
}
auto out_sub_tensor =
out->Slice(out_start, out_start + x_seq_len * repeat_num);
out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
EigenMatrix<T>::From(x_sub_tensor)
.broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
}
for (int j = 0; j < repeat_num; ++j) { for (int j = 0; j < repeat_num; ++j) {
if (x_lod.size() == 1) { out_lod.push_back(out_lod.back() + x_seq_len);
out_lod[0].push_back(out_lod[0].back() + x_seq_len);
}
out_offset++; out_offset++;
} }
} }
// write lod to out if x has lod
auto& ref_lod = *out->mutable_lod();
ref_lod[0] = out_lod;
}
framework::Vector<size_t> ref_x_lod;
if (x->lod().size() == 1) {
ref_x_lod = x->lod()[0];
} else {
// x_lod doesn't has lod, use fake x lod, level = 0
ref_x_lod.resize(x->dims()[0] + 1);
std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
}
SequenceExpandFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
y_lod[ref_level], out);
} }
}; };
...@@ -101,6 +144,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> { ...@@ -101,6 +144,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
* Grad(X).lod = Input(X).lod * Grad(X).lod = Input(X).lod
* *
* */ * */
template <typename T>
struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext& context, const LoDTensor& dout,
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context, dx, static_cast<T>(0));
int dout_offset = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
if (repeat_num > 0) {
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
auto dx_sub = dx->Slice(x_start, x_end);
dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
int dout_end = dout_offset + repeat_num * x_seq_len;
auto dout_sub = dout.Slice(dout_offset, dout_end);
dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
col_sum(context, dout_sub, &dx_sub);
dout_offset += repeat_num * x_seq_len;
}
}
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SequenceExpandGradKernel : public framework::OpKernel<T> { class SequenceExpandGradKernel : public framework::OpKernel<T> {
public: public:
...@@ -114,43 +187,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> { ...@@ -114,43 +187,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x->mutable_data<T>(context.GetPlace()); g_x->mutable_data<T>(context.GetPlace());
g_x->set_lod(x->lod()); g_x->set_lod(x->lod());
auto& x_lod = x->lod();
auto& y_lod = y->lod(); auto& y_lod = y->lod();
if (ref_level == -1) ref_level = y_lod.size() - 1; if (ref_level == -1) ref_level = y_lod.size() - 1;
// just copy the gradient // just copy the gradient
if (y_lod[ref_level].size() <= 1) { if (y_lod[ref_level].size() <= 1) {
framework::TensorCopy(*g_out, context.GetPlace(), g_x); framework::TensorCopy(*g_out, context.GetPlace(), g_x);
return; return;
} }
auto& dev_ctx = context.template device_context<DeviceContext>(); framework::Vector<size_t> ref_x_lod;
framework::Vector<size_t> ref_lod = y_lod[ref_level];
math::SetConstant<DeviceContext, T> set_zero; if (x->lod().size() == 1) {
set_zero(dev_ctx, g_x, static_cast<T>(0)); ref_x_lod = x->lod()[0];
} else {
int g_out_offset = 0; // x_lod doesn't has lod, use fake x lod, level = 0
for (size_t i = 1; i < y_lod[ref_level].size(); ++i) { ref_x_lod.resize(x->dims()[0] + 1);
int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]; std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
if (repeat_num > 0) {
int x_start = i - 1;
int x_end = i;
if (x_lod.size() == 1) {
x_start = x_lod[0][i - 1];
x_end = x_lod[0][i];
}
int x_seq_len = x_end - x_start;
auto g_x_sub = g_x->Slice(x_start, x_end);
g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
int g_out_end = g_out_offset + repeat_num * x_seq_len;
auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
math::ColwiseSum<DeviceContext, T> col_sum;
col_sum(dev_ctx, g_out_sub, &g_x_sub);
g_out_offset += repeat_num * x_seq_len;
}
} }
SequenceExpandGradFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
ref_lod, g_x);
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_pool_op.h" #include "paddle/fluid/operators/sequence_pool_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sequence_softmax_op.h" #include "paddle/fluid/operators/sequence_softmax_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto &grad_rows = grad->rows(); auto &grad_rows = grad->rows();
size_t grad_row_numel = grad_value.numel() / grad_rows.size(); size_t grad_row_numel = grad_value.numel() / grad_rows.size();
PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height); PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
param_out->numel() / grad_height);
auto *grad_data = grad_value.data<T>(); auto *grad_data = grad_value.data<T>();
auto *out_data = param_out->data<T>(); auto *out_data = param_out->data<T>();
...@@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < grad_rows.size(); i++) { for (size_t i = 0; i < grad_rows.size(); i++) {
PADDLE_ENFORCE(grad_rows[i] < grad_height, PADDLE_ENFORCE(grad_rows[i] < grad_height,
"Input rows index should less than height"); "Input rows index should less than height");
for (int64_t j = 0; j < grad_row_numel; j++) { for (size_t j = 0; j < grad_row_numel; j++) {
out_data[grad_rows[i] * grad_row_numel + j] -= out_data[grad_rows[i] * grad_row_numel + j] -=
lr[0] * grad_data[i * grad_row_numel + j]; lr[0] * grad_data[i * grad_row_numel + j];
} }
...@@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(), PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height"); "Input rows index should less than height");
int64_t id_index = param.index(grad.rows()[i]); int64_t id_index = param.index(grad.rows()[i]);
for (int64_t j = 0; j < grad_row_width; j++) { for (size_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -= out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j]; lr[0] * grad_data[i * grad_row_width + j];
} }
......
...@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <iostream>
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#include <iostream>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -63,9 +62,11 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -63,9 +62,11 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
softmax_md, 1 /*dim: C*/); softmax_md, 1 /*dim: C*/);
// create memory primitives // create memory primitives
auto softmax_src_memory = auto softmax_src_memory =
memory({softmax_md, mkldnn_engine}, (void*)input_data); memory({softmax_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(input_data)));
auto softmax_dst_memory = auto softmax_dst_memory =
memory({softmax_md, mkldnn_engine}, (void*)output_data); memory({softmax_md, mkldnn_engine},
static_cast<void*>(const_cast<T*>(output_data)));
auto softmax_prim_desc = auto softmax_prim_desc =
softmax_forward::primitive_desc(softmax_desc, mkldnn_engine); softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory, auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
......
...@@ -60,7 +60,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> { ...@@ -60,7 +60,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
} else if (ids_var->IsType<framework::SelectedRows>()) { } else if (ids_var->IsType<framework::SelectedRows>()) {
const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids"); const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
auto &ids_dims = ids_selected_rows->value().dims(); auto &ids_dims = ids_selected_rows->value().dims();
PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), ""); PADDLE_ENFORCE_EQ(ids_dims[0],
static_cast<int64_t>(ids_selected_rows->rows().size()),
"");
const T *ids = ids_selected_rows->value().data<T>(); const T *ids = ids_selected_rows->value().data<T>();
const auto &ids_rows = ids_selected_rows->rows(); const auto &ids_rows = ids_selected_rows->rows();
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out"); auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
...@@ -77,7 +79,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> { ...@@ -77,7 +79,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
framework::DDim ddim = framework::make_ddim( framework::DDim ddim = framework::make_ddim(
{static_cast<int64_t>(out->rows().size()), row_width}); {static_cast<int64_t>(out->rows().size()), row_width});
T *output = out->mutable_value()->mutable_data<T>(ddim, place); T *output = out->mutable_value()->mutable_data<T>(ddim, place);
for (size_t i = 0; i < ddim[0]; ++i) { for (int64_t i = 0; i < ddim[0]; ++i) {
memcpy(output + i * row_width, ids + out->rows()[i] * row_width, memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <chrono> #include <chrono> // NOLINT
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
......
...@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, ...@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride, const framework::DDim& src_stride,
const framework::DDim& dst_dim, const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) { const framework::DDim& dst_stride, T* dst) {
using namespace detail; paddle::operators::detail::StridedCopyDimVisitor<T> func(
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst); dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim); boost::apply_visitor(func, dst_dim);
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
namespace paddle { namespace paddle {
...@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col, ...@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
} }
template <typename T, int MaxLength, int BlockSize> template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam, __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* src, int beam_size, const T* src,
bool& firstStep, bool& is_empty, bool* firstStep, bool* is_empty,
Pair<T>& max, int dim, Pair<T>* max, int dim,
const int tid) { const int tid) {
if (beam > 0) { if (*beam > 0) {
int length = beam < beam_size ? beam : beam_size; int length = (*beam) < beam_size ? *beam : beam_size;
if (firstStep) { if (*firstStep) {
firstStep = false; *firstStep = false;
GetTopK<T, BlockSize>(topk, src, tid, dim, length); GetTopK<T, BlockSize>(topk, src, tid, dim, length);
} else { } else {
for (int k = 0; k < MaxLength; k++) { for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) { if (k < MaxLength - (*beam)) {
topk[k] = topk[k + beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-INFINITY, -1);
} }
} }
if (!is_empty) { if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max, GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
length); length);
} }
} }
max = topk[MaxLength - 1]; *max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true; if ((*max).v == -1) *is_empty = true;
beam = 0; *beam = 0;
} }
} }
template <typename T, int MaxLength, int BlockSize> template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam, __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
int beam_size, const T* val, int beam_size, const T* val,
int* col, bool& firstStep, int* col, bool* firstStep,
bool& is_empty, Pair<T>& max, bool* is_empty, Pair<T>* max,
int dim, const int tid) { int dim, const int tid) {
if (beam > 0) { if (*beam > 0) {
int length = beam < beam_size ? beam : beam_size; int length = (*beam) < beam_size ? *beam : beam_size;
if (firstStep) { if (*firstStep) {
firstStep = false; *firstStep = false;
GetTopK<T, BlockSize>(topk, val, col, tid, dim, length); GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
} else { } else {
for (int k = 0; k < MaxLength; k++) { for (int k = 0; k < MaxLength; k++) {
if (k < MaxLength - beam) { if (k < MaxLength - *beam) {
topk[k] = topk[k + beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-INFINITY, -1);
} }
} }
if (!is_empty) { if (!(*is_empty)) {
GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max, GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
length); length);
} }
} }
max = topk[MaxLength - 1]; *max = topk[MaxLength - 1];
if (max.v == -1) is_empty = true; if ((*max).v == -1) *is_empty = true;
beam = 0; *beam = 0;
} }
} }
template <typename T, int MaxLength, int BlockSize> template <typename T, int MaxLength, int BlockSize>
__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid, __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
Pair<T> topk[], T** topVal, Pair<T> topk[], T** topVal,
int64_t** topIds, int& beam, int& k, int64_t** topIds, int* beam, int* k,
const int tid, const int warp) { const int tid, const int warp) {
while (true) { while (true) {
__syncthreads(); __syncthreads();
...@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid, ...@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
(*topVal)++; (*topVal)++;
(*topIds)++; (*topIds)++;
} }
if (tid == maxid[0]) beam++; if (tid == maxid[0]) (*beam)++;
if (--k == 0) break; if (--(*k) == 0) break;
__syncthreads(); __syncthreads();
if (tid == maxid[0]) { if (tid == maxid[0]) {
if (beam < MaxLength) { if (*beam < MaxLength) {
sh_topk[tid] = topk[beam]; sh_topk[tid] = topk[*beam];
} }
} }
if (maxid[0] / 32 == warp) { if (maxid[0] / 32 == warp) {
if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
} }
} }
} }
...@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, ...@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
topk[k].set(-INFINITY, -1); topk[k].set(-INFINITY, -1);
} }
while (k) { while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k, ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
src + blockIdx.x * lds, firststep, src + blockIdx.x * lds, &firststep,
is_empty, max, dim, tid); &is_empty, &max, dim, tid);
sh_topk[tid] = topk[0]; sh_topk[tid] = topk[0];
BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output, BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
&indices, beam, k, tid, warp); &indices, &beam, &k, tid, warp);
} }
} }
...@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
KeMatrixTopK<T, 5, 256><<< KeMatrixTopK<T, 5, 256><<<
grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>( grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context()) ctx.device_context())
.stream()>>>(output_data, output->dims()[1], .stream()>>>(
indices_data, input_data, output_data, output->dims()[1], indices_data, input_data, input_width,
input_width, input_width, int(k)); input_width, static_cast<int>(k));
} }
}; };
......
...@@ -24,7 +24,19 @@ template <typename T> ...@@ -24,7 +24,19 @@ template <typename T>
class CPUUniformRandomKernel : public framework::OpKernel<T> { class CPUUniformRandomKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* tensor = ctx.Output<framework::Tensor>("Out"); framework::Tensor* tensor = nullptr;
auto out_var = ctx.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(shape));
} else {
PADDLE_THROW(
"uniform_random_op's output only"
"supports SelectedRows and Tensor");
}
T* data = tensor->mutable_data<T>(ctx.GetPlace()); T* data = tensor->mutable_data<T>(ctx.GetPlace());
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed")); unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
std::minstd_rand engine; std::minstd_rand engine;
......
...@@ -43,7 +43,19 @@ template <typename T> ...@@ -43,7 +43,19 @@ template <typename T>
class GPUUniformRandomKernel : public framework::OpKernel<T> { class GPUUniformRandomKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out"); framework::Tensor* tensor = nullptr;
auto out_var = context.OutputVar("Out");
if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = context.Attr<std::vector<int>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(shape));
} else {
PADDLE_THROW(
"uniform_random_op's output only"
"supports SelectedRows and Tensor");
}
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed")); unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
if (seed == 0) { if (seed == 0) {
......
...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h> #include <string>
#include <string.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -18,16 +18,22 @@ limitations under the License. */ ...@@ -18,16 +18,22 @@ limitations under the License. */
#error device_ptr_cast must be include by .cu file #error device_ptr_cast must be include by .cu file
#endif #endif
#include <thrust/device_ptr.h> #include <type_traits> // For std::remove_pointer and std::is_pointer.
#include "thrust/device_ptr.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace details { namespace details {
// PointerToThrustDevicePtr has two speicalizations, one casts a (CUDA
// device) pointer into thrust::device_ptr, the other keeps rest types
// un-casted.
template <typename T, bool is_ptr> template <typename T, bool is_ptr>
struct DevicePtrCast; struct PointerToThrustDevicePtr;
template <typename T> template <typename T>
struct DevicePtrCast<T, true> { struct PointerToThrustDevicePtr<T, true> {
using ELEM = typename std::remove_pointer<T>::type; using ELEM = typename std::remove_pointer<T>::type;
using RTYPE = thrust::device_ptr<ELEM>; using RTYPE = thrust::device_ptr<ELEM>;
...@@ -37,17 +43,26 @@ struct DevicePtrCast<T, true> { ...@@ -37,17 +43,26 @@ struct DevicePtrCast<T, true> {
}; };
template <typename T> template <typename T>
struct DevicePtrCast<T, false> { struct PointerToThrustDevicePtr<T, false> {
using RTYPE = T; using RTYPE = T;
inline RTYPE operator()(RTYPE it) const { return it; } inline RTYPE operator()(RTYPE it) const { return it; }
}; };
// Cast T to thrust::device_ptr if T is a pointer. // CastToCUDATransformIterator casts a pointer to thrust::device_ptr
// Otherwise, e.g., T is a iterator, return T itself. // so it could be used as the iterator of thrust::transform. It
// doesn't cast other types.
//
// We need CastToCUDATransformIterator because it is often that we
// want to use device memory pointers as transform iterators, e.g., to
// transform a block of float32 to float16. In this case, we want
// CastToCUDATransformIterator to cast float16/32 pointers to
// thrust::device_ptr, otherwise they cannot work as the iterator
// required by thrust::transform. At the same time, we don't want to
// cast thrust::device_ptr to thrust::device_ptr repeatedly.
template <typename T> template <typename T>
auto DevPtrCast(T t) -> auto CastToCUDATransformIterator(T t) ->
typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE { typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
DevicePtrCast<T, std::is_pointer<T>::value> cast; PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
return cast(t); return cast(t);
} }
......
...@@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() {
Place CUDADeviceContext::GetPlace() const { return place_; } Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const { void CUDADeviceContext::Wait() const {
std::lock_guard<std::mutex> guard(mutex_); std::lock_guard<std::recursive_mutex> guard(mutex_);
PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaGetLastError()); PADDLE_ENFORCE(cudaGetLastError());
} }
......
...@@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext { ...@@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const; cudaStream_t stream() const;
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::recursive_mutex> guard(mutex_);
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
private: private:
CUDAPlace place_; CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
mutable std::mutex mutex_; mutable std::recursive_mutex mutex_;
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
......
...@@ -14,29 +14,44 @@ limitations under the License. */ ...@@ -14,29 +14,44 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <type_traits>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include <algorithm>
#include <type_traits>
#ifdef __NVCC__ #ifdef __NVCC__
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include "paddle/fluid/platform/details/device_ptr_cast.h" #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
#endif #endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Transform on host or device. It provides the same API in std library. // Transform applys a unary or a binary functor on each element in a
// range defined by a pair of iterators.
//
// - The specialization for CPU calls std::transform.
// - The specialization for CUDA calls thrust::tranform.
//
// NOTE: We need to define InputIter and OutputIter defined as
// different types, because the InputIter points op's inputs and
// OutputIter pints to op's outputs.
//
// NOTE: We don't assume that InputIter to be const InputType* and
// OutputIter to be OutputType*, because we might use a iterator
// class, paddle::fluid::operators::RowwiseTRansformIterator.
template <typename DeviceContext> template <typename DeviceContext>
struct Transform { struct Transform {
// The unary version.
template <typename InputIter, typename OutputIter, typename UnaryOperation> template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last, void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op); OutputIter result, UnaryOperation op);
// The binary version.
template <typename InputIter1, typename InputIter2, typename OutputIter, template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation> typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1, void operator()(const DeviceContext& context, InputIter1 first1,
...@@ -70,8 +85,9 @@ struct Transform<platform::CUDADeviceContext> { ...@@ -70,8 +85,9 @@ struct Transform<platform::CUDADeviceContext> {
auto place = context.GetPlace(); auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
thrust::transform(thrust::cuda::par.on(context.stream()), thrust::transform(thrust::cuda::par.on(context.stream()),
details::DevPtrCast(first), details::DevPtrCast(last), details::CastToCUDATransformIterator(first),
details::DevPtrCast(result), op); details::CastToCUDATransformIterator(last),
details::CastToCUDATransformIterator(result), op);
} }
template <typename InputIter1, typename InputIter2, typename OutputIter, template <typename InputIter1, typename InputIter2, typename OutputIter,
...@@ -82,9 +98,10 @@ struct Transform<platform::CUDADeviceContext> { ...@@ -82,9 +98,10 @@ struct Transform<platform::CUDADeviceContext> {
auto place = context.GetPlace(); auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
thrust::transform(thrust::cuda::par.on(context.stream()), thrust::transform(thrust::cuda::par.on(context.stream()),
details::DevPtrCast(first1), details::DevPtrCast(last1), details::CastToCUDATransformIterator(first1),
details::DevPtrCast(first2), details::DevPtrCast(result), details::CastToCUDATransformIterator(last1),
op); details::CastToCUDATransformIterator(first2),
details::CastToCUDATransformIterator(result), op);
} }
}; };
#endif #endif
......
...@@ -18,11 +18,12 @@ limitations under the License. */ ...@@ -18,11 +18,12 @@ limitations under the License. */
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace {
template <typename T> template <typename T>
class Scale { class Scale {
public: public:
explicit Scale(const T& scale) : scale_(scale) {} explicit Scale(const T& scale) : scale_(scale) {}
HOSTDEVICE T operator()(const T& a) const { return a * scale_; } HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
private: private:
...@@ -35,11 +36,23 @@ class Multiply { ...@@ -35,11 +36,23 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
}; };
} // namespace
using paddle::memory::Alloc;
using paddle::memory::Free;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::Transform;
TEST(Transform, CPUUnary) { TEST(Transform, CPUUnary) {
using namespace paddle::platform;
CPUDeviceContext ctx; CPUDeviceContext ctx;
float buf[4] = {0.1, 0.2, 0.3, 0.4}; float buf[4] = {0.1, 0.2, 0.3, 0.4};
Transform<paddle::platform::CPUDeviceContext> trans; Transform<CPUDeviceContext> trans;
trans(ctx, buf, buf + 4, buf, Scale<float>(10)); trans(ctx, buf, buf + 4, buf, Scale<float>(10));
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
...@@ -47,14 +60,12 @@ TEST(Transform, CPUUnary) { ...@@ -47,14 +60,12 @@ TEST(Transform, CPUUnary) {
} }
TEST(Transform, GPUUnary) { TEST(Transform, GPUUnary) {
using namespace paddle::platform;
using namespace paddle::memory;
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
...@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) { ...@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) {
} }
TEST(Transform, CPUBinary) { TEST(Transform, CPUBinary) {
using namespace paddle::platform;
using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
Transform<paddle::platform::CPUDeviceContext> trans; Transform<CPUDeviceContext> trans;
CPUDeviceContext ctx; CPUDeviceContext ctx;
trans(ctx, buf, buf + 4, buf, buf, Multiply<int>()); trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
...@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) { ...@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) {
} }
TEST(Transform, GPUBinary) { TEST(Transform, GPUBinary) {
using namespace paddle::platform;
using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
......
...@@ -14,29 +14,25 @@ limitations under the License. */ ...@@ -14,29 +14,25 @@ limitations under the License. */
#pragma once #pragma once
#ifdef __CUDACC__ // Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__
#ifdef __CUDACC_VER_MAJOR__ // is removed, so we have to manually define __CUDACC_VER__ instead.
// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define // For details, please refer to
// __CUDACC_VER__ instead. // https://github.com/PaddlePaddle/Paddle/issues/6626
#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__)
#undef __CUDACC_VER__ #undef __CUDACC_VER__
#define __CUDACC_VER__ \ #define __CUDACC_VER__ \
(__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \ __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \
__CUDACC_VER_BUILD__) __CUDACC_VER_MINOR__ * 100
#endif
#endif #endif
#include <boost/config.hpp> #include "boost/config.hpp"
#ifdef PADDLE_WITH_CUDA // Because Boost 1.41.0's variadic templates has bug on nvcc, boost
// will disable variadic template support in NVCC mode. Define
// Because boost's variadic templates has bug on nvcc, boost will disable // BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// variadic template support when GPU enabled on nvcc. // function symbols. For details,
// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
// function symbols.
//
// https://github.com/PaddlePaddle/Paddle/issues/3386 // https://github.com/PaddlePaddle/Paddle/issues/3386
#ifdef PADDLE_WITH_CUDA
#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
#define BOOST_NO_CXX11_VARIADIC_TEMPLATES #define BOOST_NO_CXX11_VARIADIC_TEMPLATES
#endif #endif
......
...@@ -43,5 +43,5 @@ TEST(Chunk, Compressor) { ...@@ -43,5 +43,5 @@ TEST(Chunk, Compressor) {
ch.Clear(); ch.Clear();
ch.Parse(ss); ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 18); ASSERT_EQ(ch.NumBytes(), 18ul);
} }
file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py) file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
set(PY_FILES paddle/__init__.py set(PY_FILES paddle/__init__.py
${UTILS_PY_FILES} ${UTILS_PY_FILES}
${FLUID_PY_FILES}) ${FLUID_PY_FILES})
...@@ -7,7 +7,7 @@ set(PY_FILES paddle/__init__.py ...@@ -7,7 +7,7 @@ set(PY_FILES paddle/__init__.py
if(NOT WITH_FLUID_ONLY) if(NOT WITH_FLUID_ONLY)
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
set(PY_FILES ${PY_FILES} set(PY_FILES ${PY_FILES}
${TRAINER_PY_FILES} ${TRAINER_PY_FILES}
${HELPERS_PY_FILES} ${HELPERS_PY_FILES}
...@@ -55,7 +55,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/pad ...@@ -55,7 +55,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/pad
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc COMMAND touch stub.cc
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
......
...@@ -1115,4 +1115,6 @@ class DistributeTranspiler: ...@@ -1115,4 +1115,6 @@ class DistributeTranspiler:
for op2 in find_ops: for op2 in find_ops:
if ufind.is_connected(op1, op2): if ufind.is_connected(op1, op2):
lr_ops.append(op1) lr_ops.append(op1)
# we only need to append op for once
break
return lr_ops return lr_ops
...@@ -29,17 +29,20 @@ dtype_to_size = { ...@@ -29,17 +29,20 @@ dtype_to_size = {
core.VarDesc.VarType.BOOL: 1 core.VarDesc.VarType.BOOL: 1
} }
sub_block_ops = [ SUB_BLOCK_OPS = [
"while", "while_grad", "parallel_do", "parallel_do_grad", "while", "while_grad", "parallel_do", "parallel_do_grad",
"conditional_block", "conditional_block_grad" "conditional_block", "conditional_block_grad"
] ]
SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
("conditional_block", "conditional_block_grad")]
PRINT_LOG = False PRINT_LOG = False
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt): def __init__(self, program, ops, forward_num, skip_opt):
self._program = Program self._program = program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
self._successors = defaultdict(set) self._successors = defaultdict(set)
...@@ -51,6 +54,7 @@ class ControlFlowGraph(object): ...@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
self._skip_opt = skip_opt self._skip_opt = skip_opt
def _add_connections(self, connections): def _add_connections(self, connections):
"""Populates _successors and _presuccessors for two neighbor nodes."""
for node1, node2 in connections: for node1, node2 in connections:
self._add(node1, node2) self._add(node1, node2)
...@@ -58,7 +62,11 @@ class ControlFlowGraph(object): ...@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
self._successors[node1].add(node2) self._successors[node1].add(node2)
self._presuccessors[node2].add(node1) self._presuccessors[node2].add(node1)
# TODO(panyx0718): We need to have a unified way of building intermediate
# representation.
def _build_graph(self): def _build_graph(self):
"""Build a graph based on op sequence.
"""
self.op_size = len(self._ops) self.op_size = len(self._ops)
op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
self._add_connections(op_node_connections) self._add_connections(op_node_connections)
...@@ -82,15 +90,14 @@ class ControlFlowGraph(object): ...@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
self._live_out[i].add(new_name) self._live_out[i].add(new_name)
def _reach_fixed_point(self, live_in, live_out): def _reach_fixed_point(self, live_in, live_out):
"""Check if the liveness set has stablized."""
if len(live_in) != len(self._live_in): if len(live_in) != len(self._live_in):
return False return False
if len(live_out) != len(self._live_out): if len(live_out) != len(self._live_out):
return False return False
for i in range(self.op_size): for i in range(self.op_size):
if live_in[i] != self._live_in[i]: if (live_in[i] != self._live_in[i] or
return False live_out[i] != self._live_out[i]):
for i in range(self.op_size):
if live_out[i] != self._live_out[i]:
return False return False
return True return True
...@@ -98,6 +105,8 @@ class ControlFlowGraph(object): ...@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
self._build_graph() self._build_graph()
live_in = defaultdict(set) live_in = defaultdict(set)
live_out = defaultdict(set) live_out = defaultdict(set)
# Repeatedly apply liveness updates until the algorithm stablize
# on a complete set live input vars and live output vars.
while True: while True:
for i in range(self.op_size, 0, -1): for i in range(self.op_size, 0, -1):
live_in[i] = set(self._live_in[i]) live_in[i] = set(self._live_in[i])
...@@ -141,6 +150,8 @@ class ControlFlowGraph(object): ...@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
return False return False
return True return True
# TODO(panyx0718): This needs to be less hacky. It seems memory optimization
# doesn't consider vars copied between cpu and gpu.
def _update_skip_opt_set(self): def _update_skip_opt_set(self):
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
...@@ -154,7 +165,7 @@ class ControlFlowGraph(object): ...@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
bwd_id = 0 bwd_id = 0
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
if op.type() in sub_block_ops: if op.type() in SUB_BLOCK_OPS:
continue continue
block_desc = op.block() block_desc = op.block()
is_forward = i < self._forward_num is_forward = i < self._forward_num
...@@ -177,13 +188,15 @@ class ControlFlowGraph(object): ...@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
def compare_shape(x_shape, cache_shape, opt_level): def compare_shape(x_shape, cache_shape, opt_level):
if opt_level == 0: if opt_level == 0:
return x_shape == cache_shape return x_shape == cache_shape
if opt_level == 1: elif opt_level == 1:
if (x_shape[0] == -1) ^ (cache_shape[0] == -1): if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
return False return False
x_size = abs(reduce(lambda x, y: x * y, x_shape)) x_size = abs(reduce(lambda x, y: x * y, x_shape))
cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
if x_size <= cache_size: if x_size <= cache_size:
return True return True
else:
raise ValueError("only support opt_level 0 or 1.")
return False return False
self._dataflow_analyze() self._dataflow_analyze()
...@@ -191,10 +204,9 @@ class ControlFlowGraph(object): ...@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
self.pool = [] self.pool = []
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
if op.type() in sub_block_ops: if op.type() in SUB_BLOCK_OPS:
continue continue
block_desc = op.block() block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
defs_can_optimize = filter( defs_can_optimize = filter(
...@@ -211,37 +223,40 @@ class ControlFlowGraph(object): ...@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
for index, cache_pair in enumerate(self.pool): for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0] cache_var = cache_pair[0]
cache_shape = cache_pair[1] cache_shape = cache_pair[1]
if compare_shape(x_shape, cache_shape, level): if not compare_shape(x_shape, cache_shape, level):
if self._has_var(block_desc, cache_var, is_forward): continue
if not self._has_var(block_desc, cache_var, is_forward):
continue
x_dtype = self._find_var(block_desc, x, x_dtype = self._find_var(block_desc, x,
is_forward).dtype() is_forward).dtype()
cache_dtype = self._find_var( cache_dtype = self._find_var(block_desc, cache_var,
block_desc, cache_var, is_forward).dtype() is_forward).dtype()
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype] # TODO(qijun): actually, we should compare
# and dtype_to_size[cache_dtype] # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype: if x_dtype != cache_dtype:
continue
if PRINT_LOG: if PRINT_LOG:
print( print(("Hit Cache !!!! cache pool index "
("Hit Cache !!!! cache pool index "
"is %d, var name is %s, " "is %d, var name is %s, "
"cached var name is %s, " "cached var name is %s, "
"var shape is %s ") % "var shape is %s ") % (index, x, cache_var,
(index, x, cache_var,
str(cache_shape))) str(cache_shape)))
self.pool.pop(index) self.pool.pop(index)
if x == cache_var: if x == cache_var:
break break
_rename_arg_( # Rename the var to the cache var already with
self._ops, x, cache_var, begin_idx=i) # memory allocated in order to reuse the memory.
self._program.block(block_desc.id).var( _rename_arg_(self._ops, x, cache_var, begin_idx=i)
str(x)).desc = self._find_var( self._program.block(block_desc.id).var(str(
block_desc, cache_var, is_forward) x)).desc = self._find_var(block_desc, cache_var,
self._update_graph( is_forward)
x, cache_var, begin_idx=i) self._update_graph(x, cache_var, begin_idx=i)
break break
in_diff, out_diff = self._get_diff(self._live_in[i], in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
self._live_out[i])
can_optimize = filter( can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward), lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff) in_diff)
...@@ -252,6 +267,19 @@ class ControlFlowGraph(object): ...@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
def _process_sub_block_pair(pdesc, sub_block_pair): def _process_sub_block_pair(pdesc, sub_block_pair):
"""Creates a list of tuple each of which tracks info of a subblock.
Note: this function doesn't handle nested subblocks yet.
TODO(panyx0718): assert if case nested subblocks happen.
:param pdesc: ProgramDesc.
:param sub_block_pair: A list op pairs. Each op pair is the forward
op and backward op. The ops in the list are special that they contain
a subblock of ops.
:return: A list of tuples, each tuple is (all ops in a subblock pair
including forward and backward, number of forward ops,
all output args names of the ops in the subblock pairs).
"""
ops_list = [] ops_list = []
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
op_size = block_desc.op_size() op_size = block_desc.op_size()
...@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair): ...@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
def _get_cfgs(input_program): def _get_cfgs(input_program):
"""Process each block and create ControlFlowGraph for each of them.
:param input_program: Program object.
:return: A list of ControlFlowGraph, each corresponds to a block.
"""
ops_list = [] ops_list = []
pdesc = input_program.get_desc() pdesc = input_program.get_desc()
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
...@@ -316,11 +349,8 @@ def _get_cfgs(input_program): ...@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
ops_list.append( ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set())) ([block_desc.op(i) for i in range(op_size)], op_size, set()))
sub_block_pair = [("while", "while_grad"), ("parallel_do", # Only process one level of nested subblock.
"parallel_do_grad"), ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
("conditional_block", "conditional_block_grad")]
ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
cfgs = [ cfgs = [
ControlFlowGraph(input_program, ops, forward_num, skip_opt) ControlFlowGraph(input_program, ops, forward_num, skip_opt)
...@@ -330,6 +360,17 @@ def _get_cfgs(input_program): ...@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
def memory_optimize(input_program, print_log=False, level=0): def memory_optimize(input_program, print_log=False, level=0):
"""Optimize memory by reusing var memory.
Note: it doesn't not support subblock nested in subblock.
:param input_program: Input Program
:param print_log: whether to print debug log.
:param level: If level=0, reuse if the shape is completely equal, o
:return:
"""
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
global PRINT_LOG global PRINT_LOG
PRINT_LOG = print_log PRINT_LOG = print_log
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
......
...@@ -12,17 +12,16 @@ ...@@ -12,17 +12,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import contextlib
import math import math
import numpy as np import numpy as np
import os
import time
import unittest
import paddle import paddle
import paddle.dataset.conll05 as conll05 import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.initializer import init_on_cpu
import contextlib
import time
import unittest
import os
word_dict, verb_dict, label_dict = conll05.get_dict() word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict) word_dict_len = len(word_dict)
...@@ -37,7 +36,7 @@ depth = 8 ...@@ -37,7 +36,7 @@ depth = 8
mix_hidden_lr = 1e-3 mix_hidden_lr = 1e-3
IS_SPARSE = True IS_SPARSE = True
PASS_NUM = 10 PASS_NUM = 100
BATCH_SIZE = 10 BATCH_SIZE = 10
embedding_name = 'emb' embedding_name = 'emb'
...@@ -77,7 +76,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -77,7 +76,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
emb_layers.append(mark_embedding) emb_layers.append(mark_embedding)
hidden_0_layers = [ hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
] ]
hidden_0 = fluid.layers.sums(input=hidden_0_layers) hidden_0 = fluid.layers.sums(input=hidden_0_layers)
...@@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
for i in range(1, depth): for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[ mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim), fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim) fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
]) ])
lstm = fluid.layers.dynamic_lstm( lstm = fluid.layers.dynamic_lstm(
...@@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ...@@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
input_tmp = [mix_hidden, lstm] input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[ feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len), fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len) fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
]) ])
return feature_out return feature_out
...@@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
# check other optimizers and check why out will be NAN # check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD( sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay( learning_rate=fluid.layers.exponential_decay(
learning_rate=0.0001, learning_rate=0.01,
decay_steps=100000, decay_steps=100000,
decay_rate=0.5, decay_rate=0.5,
staircase=True)) staircase=True))
...@@ -233,7 +233,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -233,7 +233,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
print("second per batch: " + str((time.time( print("second per batch: " + str((time.time(
) - start_time) / batch_id)) ) - start_time) / batch_id))
# Set the threshold low to speed up the CI test # Set the threshold low to speed up the CI test
if float(pass_precision) > 0.05: if float(pass_precision) > 0.01:
if save_dirname is not None: if save_dirname is not None:
# TODO(liuyiqun): Change the target to crf_decode # TODO(liuyiqun): Change the target to crf_decode
fluid.io.save_inference_model(save_dirname, [ fluid.io.save_inference_model(save_dirname, [
......
...@@ -157,7 +157,6 @@ def train(nn_type, ...@@ -157,7 +157,6 @@ def train(nn_type,
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
pserver_endpoints = os.getenv("PSERVERS")
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# The fully connected test is removed whe the WITH_MKLDNN flag is OFF
# Because the fully connected layer has only one kernel (MKLDNN)
if(NOT WITH_MKLDNN)
list(REMOVE_ITEM TEST_OPS test_fc_op)
endif(NOT WITH_MKLDNN)
if(NOT WITH_DISTRIBUTE) if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op) list(REMOVE_ITEM TEST_OPS test_recv_op)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
...@@ -28,6 +34,8 @@ function(py_test_modules TARGET_NAME) ...@@ -28,6 +34,8 @@ function(py_test_modules TARGET_NAME)
endif() endif()
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_sequence_expand)
# test time consuming OPs in a separate process for expliot parallism # test time consuming OPs in a separate process for expliot parallism
list(REMOVE_ITEM TEST_OPS test_parallel_executor) list(REMOVE_ITEM TEST_OPS test_parallel_executor)
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
...@@ -64,6 +72,8 @@ else() ...@@ -64,6 +72,8 @@ else()
endforeach(TEST_OP) endforeach(TEST_OP)
endif(WITH_FAST_BUNDLE_TEST) endif(WITH_FAST_BUNDLE_TEST)
#
py_test_modules(test_sequence_expand MODULES test_sequence_expand)
# tests with high overhead # tests with high overhead
py_test_modules(test_parallel_executor MODULES test_parallel_executor) py_test_modules(test_parallel_executor MODULES test_parallel_executor)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR}) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
......
...@@ -505,3 +505,148 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): ...@@ -505,3 +505,148 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
train_loss, test_loss, atol=1e-8), train_loss, test_loss, atol=1e-8),
"Train loss: " + str(train_loss) + "\n Test loss:" + "Train loss: " + str(train_loss) + "\n Test loss:" +
str(test_loss)) str(test_loss))
import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark, size=[mark_dict_len, mark_dim], dtype='float32')
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
])
return feature_out
class TestCRFModel(unittest.TestCase):
def test_all(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=1e-1))
avg_cost = fluid.layers.mean(crf_cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=16)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
mark, target
],
place=fluid.CPUPlace())
data = train_data()
for i in xrange(10):
cur_batch = next(data)
print map(numpy.array,
pe.run(feed_dict=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))[0]
...@@ -47,8 +47,10 @@ class TestSequenceExpand(OpTest): ...@@ -47,8 +47,10 @@ class TestSequenceExpand(OpTest):
x_len = x_idx[i] - x_idx[i - 1] x_len = x_idx[i] - x_idx[i - 1]
if repeat_num > 0: if repeat_num > 0:
x_sub = x_data[x_idx[i - 1]:x_idx[i], :] x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
x_sub = np.repeat(x_sub, repeat_num, axis=0) stacked_x_sub = x_sub
out = np.vstack((out, x_sub)) for r in range(repeat_num - 1):
stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
out = np.vstack((out, stacked_x_sub))
if x_lod is not None: if x_lod is not None:
for j in xrange(repeat_num): for j in xrange(repeat_num):
out_lod[0].append(out_lod[0][-1] + x_len) out_lod[0].append(out_lod[0][-1] + x_len)
...@@ -101,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand): ...@@ -101,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
class TestSequenceExpandCase4(TestSequenceExpand): class TestSequenceExpandCase4(TestSequenceExpand):
def set_data(self): def set_data(self):
data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3] data = np.random.uniform(0.1, 1, [5 * 2, 1])
x_data = np.array(data).reshape([5, 2]).astype('float32') x_data = np.array(data).reshape([5, 2]).astype('float32')
x_lod = [[0, 2, 5]] x_lod = [[0, 2, 5]]
y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32') y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
y_lod = [[0, 1, 2], [0, 1, 2]] y_lod = [[0, 1, 3], [0, 1, 3]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
......
...@@ -15,6 +15,16 @@ ...@@ -15,6 +15,16 @@
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def output_hist(out):
hist, _ = np.histogram(out, range=(-5, 10))
hist = hist.astype("float32")
hist /= float(out.size)
prob = 0.1 * np.ones((10))
return hist, prob
class TestUniformRandomOp(OpTest): class TestUniformRandomOp(OpTest):
...@@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest): ...@@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest):
self.check_output_customized(self.verify_output) self.check_output_customized(self.verify_output)
def verify_output(self, outs): def verify_output(self, outs):
tensor = outs[0] hist, prob = output_hist(np.array(outs[0]))
hist, _ = np.histogram(outs[0], range=(-5, 10)) self.assertTrue(
hist = hist.astype("float32") np.allclose(
hist /= float(outs[0].size) hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
prob = 0.1 * np.ones((10))
class TestUniformRandomOpSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
def check_with_place(self, place):
scope = core.Scope()
out = scope.var("X").get_selected_rows()
op = Operator(
"uniform_random",
Out="X",
shape=[4, 784],
min=-5.0,
max=10.0,
seed=10)
op.run(scope, place)
self.assertEqual(out.get_tensor().shape(), [4, 784])
hist, prob = output_hist(np.array(out.get_tensor()))
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册