Commit 130ac177 authored by zhouwei25

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

...@@ -16,6 +16,7 @@
#include <queue>
#include <string>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -124,7 +125,9 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
    std::vector<OpHandleBase *> *fetch_ops,
    std::vector<OpHandleBase *> *ready_fetch_ops) {
-  for (auto &fetch_var_name : fetch_tensors) {
+  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
+                                                   fetch_tensors.end());
+  for (auto &fetch_var_name : fetch_tensor_set) {
    for (auto &var_map : graph_->Get<GraphVars>(kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -157,7 +156,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -157,7 +156,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
FeedFetchList *fetch_data) { FeedFetchList *fetch_data) {
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars; std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
std::unordered_set<VarHandleBase *> local_ready_vars; std::unordered_set<VarHandleBase *> local_ready_vars;
for (auto &fetch_var_name : fetch_tensors) { std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
fetch_tensors.end());
for (auto &fetch_var_name : fetch_tensor_set) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) { for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name); auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) { if (it != var_map.end()) {
......
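Both graph executors now copy `fetch_tensors` into an `std::unordered_set` before creating fetch ops, so a variable name requested more than once is fetched only once. A minimal standalone sketch of that deduplication step (the surrounding executor code is omitted; only the set construction mirrors the patch):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  // fetch_tensors as a user might pass it, with "loss" requested twice.
  std::vector<std::string> fetch_tensors = {"loss", "acc", "loss"};

  // Same pattern as InsertFetchOps: deduplicate names first, otherwise two
  // fetch op handles would be inserted for the same variable.
  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
                                                   fetch_tensors.end());
  for (const auto &fetch_var_name : fetch_tensor_set) {
    std::cout << "insert fetch op for " << fetch_var_name << "\n";
  }
  return 0;
}
```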
...@@ -28,8 +28,9 @@ class AucOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("Label"),
                   "Input of Label should not be null.");
    auto predict_width = ctx->GetInputDim("Predict")[1];
-   PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_width, 2,
-                                "Only support binary classification");
+   PADDLE_INFERSHAPE_ENFORCE_LE(ctx, predict_width, 2,
+                                "Only support binary classification,"
+                                "prediction dims[1] should be 1 or 2");
    auto predict_height = ctx->GetInputDim("Predict")[0];
    auto label_height = ctx->GetInputDim("Label")[0];
......
...@@ -75,7 +75,10 @@ class AucKernel : public framework::OpKernel<T> {
    const auto *label_data = label->data<int64_t>();
    for (size_t i = 0; i < batch_size; i++) {
-     auto predict_data = inference_data[i * inference_width + 1];
+     // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob
+     // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob
+     auto predict_data =
+         inference_data[i * inference_width + (inference_width - 1)];
      PADDLE_ENFORCE_LE(predict_data, 1,
                        "The predict data must less or equal 1.");
      PADDLE_ENFORCE_GE(predict_data, 0,
......
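Since the AUC op now accepts predictions with either one or two columns, the kernel always reads the positive-class probability from the last column of each row. A small sketch of just that indexing, with plain arrays standing in for the op's tensors:

```cpp
#include <cstddef>
#include <iostream>

// Last column of row i is the positive-class probability, whether the
// prediction has one column (pos prob only) or two (neg prob, pos prob).
float PosProb(const float* data, std::size_t width, std::size_t i) {
  return data[i * width + (width - 1)];
}

int main() {
  const float pred2[] = {0.9f, 0.1f, 0.3f, 0.7f, 0.4f, 0.6f};  // shape (3, 2)
  const float pred1[] = {0.1f, 0.7f, 0.6f};                    // shape (3, 1)
  for (std::size_t i = 0; i < 3; ++i) {
    std::cout << PosProb(pred2, 2, i) << " == " << PosProb(pred1, 1, i) << "\n";
  }
  return 0;
}
```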
...@@ -29,55 +29,6 @@ using mkldnn::reorder;
using mkldnn::stream;
using platform::to_void_cast;
// Generate keys for storing/retriving primitives for this operator
std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
const memory::dims& input_dims,
const std::string& pooling_type,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const memory::data_type& dt, const memory::format& fmt,
const std::string& suffix) {
std::string key;
key.reserve(platform::MKLDNNHandler::MaxKeyLength);
platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
platform::MKLDNNHandler::AppendKey(&key, pooling_type);
platform::MKLDNNHandler::AppendKeyVec(&key, ksize);
platform::MKLDNNHandler::AppendKeyVec(&key, strides);
platform::MKLDNNHandler::AppendKeyVec(&key, paddings);
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt));
platform::MKLDNNHandler::AppendKey(&key, suffix);
if (platform::get_cur_mkldnn_session_id() ==
platform::kMKLDNNSessionID_Default) {
auto tid = std::this_thread::get_id();
std::stringstream ss;
ss << tid;
platform::MKLDNNHandler::AppendKey(&key, "-t:");
platform::MKLDNNHandler::AppendKey(&key, ss.str());
}
return key;
}
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
int padding, int stride) {
return (input_size - kernel_size + 2 * padding) / stride + 1;
}
static inline void CorrectOutputSize(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const std::vector<int>& kernel_size, const std::vector<int>& paddings,
const std::vector<int>& strides,
std::vector<int>& right_bot_padding) { // NOLINT
for (size_t i = 0; i < right_bot_padding.size(); i++) {
int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
paddings[i], strides[i]);
if (desired_size != dst_tz[i + 2]) {
right_bot_padding[i] += strides[i];
}
}
}
template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
...@@ -99,7 +50,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-   bool is_test = ctx.Attr<bool>("is_test");
    if (ctx.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
...@@ -126,139 +77,46 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    mkldnn::memory::data_type dt =
        paddle::framework::ToMKLDNNDataType(input->type());
    auto fmt = input->format();
const std::string key =
CreateKey(ctx, src_tz, pooling_type, ksize, strides, paddings, dt, fmt,
ctx.op().Output("Out"));
const std::string key_pool_p = key + "@pool_p";
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::shared_ptr<mkldnn::memory> src_memory, dst_memory;
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd;
std::shared_ptr<mkldnn::memory> pool_src_memory_p, pool_dst_memory_p;
auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) {
const std::vector<int>& padding_left_top(paddings);
std::vector<int> padding_right_bottom(paddings);
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
if (ceil_mode) {
CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
padding_right_bottom);
}
-     auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format);
-
-     /* create memory descriptor for pooling without specified format
-      * ('any') which lets a primitive (pooling in this case) choose
-      * the memory format preferred for best performance
-      */
-     auto dst_md =
-         platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any);
-
-     auto propagation = src_md.data.data_type == mkldnn_f32
-                            ? mkldnn::prop_kind::forward_training
-                            : mkldnn::prop_kind::forward_scoring;
-     std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
-         CreatePrimitiveDesc(src_md, dst_md, propagation, strides,
-                             padding_left_top, padding_right_bottom, ksize,
-                             pooling_type, mkldnn_engine, ceil_mode, is_test);
-
-     // save pool_pd into global device context to be referred in backward path
-     if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);
-
-     src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
-                                           to_void_cast<T>(input_data));
-     dst_memory =
-         std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);
-
-     dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
-     dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
-
-     if (is_test) {
-       pool_p = std::make_shared<pooling_forward>(*pool_pd, *src_memory,
-                                                  *dst_memory);
-     } else {
-       std::shared_ptr<mkldnn::memory> workspace_memory =
-           CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
-
-       // save pool_workspace_memory to be referred in backward path
-       dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
-       pool_p = std::make_shared<pooling_forward>(
-           *pool_pd, *src_memory, *dst_memory, *workspace_memory);
-     }
-
-     dev_ctx.SetBlob(key_pool_p, pool_p);
-
-     output_format =
-         (memory::format)dst_memory->get_primitive_desc().desc().data.format;
-   } else {
-     // Primitives already exist
-     pool_src_memory_p =
-         std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
-     PADDLE_ENFORCE(pool_src_memory_p != nullptr,
-                    "Fail to find pooling src mem_p in device context");
-     pool_dst_memory_p =
-         std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
-     PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
-                    "Fail to find pooling dst mem_p in device context");
-     pool_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
-     pool_dst_memory_p->set_data_handle(output_data);
-
-     output_format = (memory::format)pool_dst_memory_p->get_primitive_desc()
-                         .desc()
-                         .data.format;
-   }
+   const std::string key = platform::PoolingMKLDNNHandler::GetHash(
+       src_tz, pooling_type, ksize, strides, paddings, dt, fmt,
+       ctx.op().Output("Out"));
+
+   platform::PoolingMKLDNNHandler handler(pooling_type, dt,
+                                          ctx.Attr<bool>("is_test"), dev_ctx,
+                                          mkldnn_engine, key);
+
+   auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format);
+
+   auto src_memory =
+       handler.AcquireSrcMemory(src_md, to_void_cast<T>(input_data));
+
+   /* create memory descriptor for pooling without specified format
+    * ('any') which lets a primitive (pooling in this case) choose
+    * the memory format preferred for best performance
+    */
+   auto dst_md =
+       platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any);
+
+   auto pooling_pd = handler.AcquirePoolingPrimitiveDescriptor(
+       src_tz, dst_tz, src_md, dst_md, ksize, strides, paddings,
+       ctx.Attr<bool>("ceil_mode"));
+
+   auto dst_memory =
+       handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
+
+   auto pool_p = handler.AcquirePooling(dst_memory, src_memory);

    // push primitive to stream and wait until it's executed
    std::vector<mkldnn::primitive> pipeline{*pool_p};
    stream(stream::kind::eager).submit(pipeline).wait();

+   output_format =
+       (memory::format)dst_memory->get_primitive_desc().desc().data.format;

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(output_format);
  }
private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const mkldnn::prop_kind& propagation, const std::vector<int>& stride,
const std::vector<int>& padding_left_top,
const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
const std::string& pooling_type, const mkldnn::engine& engine,
bool ceil_mode, bool is_test) const {
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn_forward_prop_kind,
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding_left_top, padding_right_bot,
mkldnn::padding_kind::zero);
auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
return std::unique_ptr<mkldnn::pooling_forward::primitive_desc>(p_pool_pd);
}
std::unique_ptr<mkldnn::memory> CreateWorkspaceMemory(
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd,
const std::string& pooling_type, const mkldnn::engine& engine) const {
mkldnn::memory::primitive_desc workspace_md =
pooling_type == "max"
? pool_pd->workspace_primitive_desc()
: mkldnn::memory::primitive_desc({{},
platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::nchw},
engine);
auto p_workspace_memory = new mkldnn::memory(workspace_md);
return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
}
};

template <typename T>
...@@ -299,6 +157,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine();

+   std::vector<mkldnn::primitive> pipeline;
+
    const T* out_grad_data = out_grad->data<T>();
    T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
    memory::format in_x_grad_format{memory::format::format_undef};
...@@ -310,119 +170,41 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    // Get an unique name from "argument" name of "Out" variable
    // This name will be used as key when referring info from device context
-   const std::string key = CreateKey(ctx, diff_src_tz, pooling_type, ksize,
-                                     strides, paddings, memory::data_type::f32,
-                                     in_x->format(), ctx.op().Input("Out"));
+   const std::string key = platform::PoolingMKLDNNHandler::GetHash(
+       diff_src_tz, pooling_type, ksize, strides, paddings,
+       memory::data_type::f32, in_x->format(), ctx.op().Input("Out"));
const std::string key_pool_bwd_p = key + "@pool_bwd_p";
const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()},
mkldnn_engine},
to_void_cast<T>(out_grad_data));
std::shared_ptr<memory> diff_src_memory;
std::shared_ptr<memory> diff_dst_memory;
auto dst_memory =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
PADDLE_ENFORCE(dst_memory != nullptr,
"Fail to find dst_memory in device context");
primitive reorder_diff_dst;
bool is_diff_dst_reordered = false;
auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
dev_ctx.GetBlob(key_pool_bwd_p));
if (pool_bwd_p == nullptr) {
// Retrieve src_memory/dst_memory saved in forward pass
auto src_memory =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
PADDLE_ENFORCE(src_memory != nullptr,
"Fail to find src_memory in device context");
// Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr,
"Fail to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr,
"Fail to find workspace_memory in device context");
// create memory descriptors for pooling
auto diff_src_md = src_memory.get()->get_primitive_desc().desc();
auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc();
auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
mkldnn::padding_kind::zero);
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd);
// reorder between user_diff_dst and pool diff_dst if needed
diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
-     diff_src_memory = std::make_shared<memory>(
-         pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data);
+   platform::PoolingMKLDNNHandler handler(
+       pooling_type, paddle::framework::ToMKLDNNDataType(in_x_grad->type()),
+       false, dev_ctx, mkldnn_engine, key);
dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory);
dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
pool_bwd_p = std::make_shared<pooling_backward>(
pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory);
dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
} else {
// Primitives already exist
diff_src_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_src_mem_p));
PADDLE_ENFORCE(diff_src_memory != nullptr,
"Fail to find pooling src mem_p in device context");
diff_dst_memory = std::static_pointer_cast<memory>(
dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
PADDLE_ENFORCE(diff_dst_memory != nullptr,
"Fail to find pooling dst mem_p in device context");
diff_src_memory->set_data_handle(reinterpret_cast<void*>(in_x_grad_data));
diff_dst_memory->set_data_handle(const_cast<T*>(out_grad_data));
// reorder between user_diff_dst and pool diff_dst if needed
if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory =
std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
is_diff_dst_reordered = true;
}
}
-   in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
-                          .desc()
-                          .data.format;
-
-   // push primitive to stream and wait until it's executed
-   std::vector<mkldnn::primitive> pipeline;
-   if (is_diff_dst_reordered) {
-     pipeline.push_back(reorder_diff_dst);
-   }
+   auto workspace = handler.AcquireWorkspaceMemory();
+
+   auto diff_dst_md = platform::MKLDNNMemDesc(
+       {diff_dst_tz}, platform::MKLDNNGetDataType<T>(), out_grad->format());
+   auto diff_dst_memory = handler.AcquireDiffDstMemory(
+       diff_dst_md, to_void_cast<T>(out_grad_data));
+
+   auto diff_src_md =
+       platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
+                               mkldnn::memory::format::any);
+
+   auto bwd_pd = handler.AcquirePoolingBackwardPrimitiveDescriptor(
+       diff_dst_md, diff_src_md, ksize, strides, paddings);
+
+   auto diff_src_memory = handler.AcquireDiffSrcMemoryFromPrimitive(
+       reinterpret_cast<void*>(in_x_grad_data));
+
+   auto pool_bwd_p = handler.AcquirePoolingBackward(diff_dst_memory, workspace,
+                                                    diff_src_memory);
    pipeline.push_back(*pool_bwd_p);
    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();

+   in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
+                          .desc()
+                          .data.format;
    in_x_grad->set_layout(DataLayout::kMKLDNN);
    in_x_grad->set_format(in_x_grad_format);
  }  // Compute()
......
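The pooling kernels above now delegate caching to `platform::PoolingMKLDNNHandler`, whose `Acquire*` methods look a blob up in the device context by key and create it only on a miss. A simplified sketch of that acquire-or-create pattern (the cache class and names here are stand-ins, not the real `MKLDNNDeviceContext` API):

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

// Stand-in for the blob map kept by the device context (illustrative only).
class BlobCache {
 public:
  void SetBlob(const std::string& key, std::shared_ptr<void> value) {
    blobs_[key] = std::move(value);
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};

struct PoolingPrimitive {};  // stand-in for mkldnn::pooling_forward

// Mirrors the shape of AcquirePooling: reuse the cached primitive when the
// key is already present, otherwise build it once and store it.
std::shared_ptr<PoolingPrimitive> AcquirePooling(BlobCache* dev_ctx,
                                                 const std::string& key) {
  auto prim_key = key + "@pooling_p";
  auto pooling_p =
      std::static_pointer_cast<PoolingPrimitive>(dev_ctx->GetBlob(prim_key));
  if (pooling_p == nullptr) {
    pooling_p = std::make_shared<PoolingPrimitive>();
    dev_ctx->SetBlob(prim_key, pooling_p);
  }
  return pooling_p;
}

int main() {
  BlobCache dev_ctx;
  auto p1 = AcquirePooling(&dev_ctx, "pool2d_0.tmp_0");
  auto p2 = AcquirePooling(&dev_ctx, "pool2d_0.tmp_0");
  std::cout << (p1 == p2 ? "cache hit" : "cache miss") << "\n";
  return 0;
}
```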
...@@ -58,10 +58,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-   ctx->SetOutputDim(framework::GradVarName("Updates"),
-                     ctx->GetInputDim("Updates"));
-   ctx->SetOutputDim(framework::GradVarName("X"),
-                     ctx->GetInputDim(framework::GradVarName("Out")));
+   if (ctx->HasOutput(framework::GradVarName("Updates"))) {
+     ctx->SetOutputDim(framework::GradVarName("Updates"),
+                       ctx->GetInputDim("Updates"));
+   }
+   if (ctx->HasOutput(framework::GradVarName("X"))) {
+     ctx->SetOutputDim(framework::GradVarName("X"),
+                       ctx->GetInputDim(framework::GradVarName("Out")));
+   }
  }

 protected:
......
...@@ -47,12 +47,15 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
-   // In place gradient: dX = dO
-   dX->ShareDataWith(*dOut);
-   dUpdates->mutable_data<T>(ctx.GetPlace());
-   // Gradient by Gather: dUpdates = dO[Ids]
-   GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
+   if (dX) {
+     // In place gradient: dX = dO
+     framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
+   }
+   if (dUpdates) {
+     dUpdates->mutable_data<T>(ctx.GetPlace());
+     // Gradient by Gather: dUpdates = dO[Ids]
+     GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
+   }
  }
};
......
...@@ -74,11 +74,15 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

-   // In place gradient: dX = dO
-   framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
-   dUpdates->mutable_data<T>(ctx.GetPlace());
-   // Gradient by Gather: dUpdates = dO[Ids]
-   CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
+   if (dX) {
+     // In place gradient: dX = dO
+     framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
+   }
+   if (dUpdates) {
+     dUpdates->mutable_data<T>(ctx.GetPlace());
+     // Gradient by Gather: dUpdates = dO[Ids]
+     CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
+   }
  }
};
......
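With the `InferShape` and kernel changes above, X@GRAD and Updates@GRAD become optional: the kernels only touch the outputs that were actually requested. A minimal sketch of that guard (simplified types, not the real Tensor or ExecutionContext interfaces):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct Tensor {
  std::vector<float> data;  // stand-in for the real framework::Tensor
};

// dX or dUpdates may be null when that gradient is not wanted, so each
// branch is guarded separately, as in ScatterGradientOpKernel.
void ScatterGrad(const Tensor& dOut, const std::vector<std::size_t>& ids,
                 Tensor* dX, Tensor* dUpdates) {
  if (dX) {
    dX->data = dOut.data;  // in place gradient: dX = dO
  }
  if (dUpdates) {
    dUpdates->data.clear();
    for (std::size_t id : ids) {
      dUpdates->data.push_back(dOut.data[id]);  // dUpdates = dO[Ids]
    }
  }
}

int main() {
  Tensor dOut{{1.f, 2.f, 3.f, 4.f}};
  Tensor dX;
  ScatterGrad(dOut, {1, 3}, &dX, /*dUpdates=*/nullptr);  // only X@GRAD wanted
  std::cout << dX.data.size() << " elements copied into dX\n";
  return 0;
}
```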
...@@ -108,17 +108,11 @@ int GetCUDADeviceCount() {
int GetCUDAComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  cudaDeviceProp device_prop;
- auto e = cudaGetDeviceProperties(&device_prop, id);
- std::ostringstream ostr;
- ostr << "cudaGetDeviceProperties failed in"
-         "paddle::platform::GetCUDAComputeCapability!"
-         "Error Type ID = "
-      << e << " Please see detail in:"
-         "https://docs.nvidia.com/cuda/cuda-runtime-api/"
-         "group__CUDART__TYPES.html#group__CUDART__TYPES_"
-         "1g3f51e3575c2178246db0a94a430e0038";
- std::string ErrorLog = ostr.str();
- PADDLE_ENFORCE(e, ErrorLog);
+ auto error_code = cudaGetDeviceProperties(&device_prop, id);
+ PADDLE_ENFORCE(error_code,
+                "cudaGetDeviceProperties failed in "
+                "paddle::platform::GetCUDAComputeCapability, error code : %d",
+                error_code);
  return device_prop.major * 10 + device_prop.minor;
}
......
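The new version checks the return code of `cudaGetDeviceProperties` directly instead of assembling a long message by hand. Outside of Paddle, the equivalent check with the plain CUDA runtime API looks roughly like this (the error-handling style is illustrative):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int GetComputeCapability(int id) {
  cudaDeviceProp device_prop;
  cudaError_t error_code = cudaGetDeviceProperties(&device_prop, id);
  if (error_code != cudaSuccess) {
    // cudaGetErrorString converts the numeric code into a readable message.
    std::fprintf(stderr, "cudaGetDeviceProperties failed: %s (code %d)\n",
                 cudaGetErrorString(error_code),
                 static_cast<int>(error_code));
    return -1;
  }
  return device_prop.major * 10 + device_prop.minor;
}

int main() {
  std::printf("compute capability: %d\n", GetComputeCapability(0));
  return 0;
}
```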
...@@ -122,6 +122,18 @@ class MKLDNNHandler {
    return mem_p;
  }
std::shared_ptr<mkldnn::memory> AcquireMemory(
const mkldnn::memory::primitive_desc& mpd, const std::string& suffix) {
auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mpd);
dev_ctx_.SetBlob(local_key, mem_p);
}
return mem_p;
}
  std::shared_ptr<mkldnn::memory> AcquireMemory(
      const std::shared_ptr<mkldnn::memory>& user_memory_p,
      const std::shared_ptr<mkldnn::memory>& target_memory_p,
...@@ -424,6 +436,223 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
  std::shared_ptr<mkldnn::eltwise_backward::primitive_desc> activation_bwd_pd_;
};
class PoolingMKLDNNHandler : public MKLDNNHandler {
public:
PoolingMKLDNNHandler(const std::string& pooling_type,
mkldnn::memory::data_type dt, bool is_test,
const platform::MKLDNNDeviceContext& dev_ctx,
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
dt_(dt),
pooling_type_(pooling_type),
is_test_(is_test) {}
std::shared_ptr<mkldnn::pooling_forward::primitive_desc>
AcquirePoolingPrimitiveDescriptor(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const mkldnn::memory::desc& src_md, const mkldnn::memory::desc& dst_md,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, bool ceil_mode) {
// Pooling PD has to be passed to Grad op that
// may be executed by different thread, hence
// for that one we use key that does not contain TID
const std::string key_pooling_pd = key_common_ + "@pooling_pd";
fwd_pd_ = std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx_.GetBlob(key_pooling_pd));
if (fwd_pd_ == nullptr) {
static std::mutex acquire_barrier;
std::lock_guard<std::mutex> block_threads_until_finish_this_job(
acquire_barrier);
fwd_pd_ =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx_.GetBlob(key_pooling_pd));
if (fwd_pd_ == nullptr) {
std::vector<int> padding_left_top(paddings);
std::vector<int> padding_right_bottom(paddings);
if (ceil_mode) {
CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
padding_right_bottom);
}
auto mkldnn_forward_prop_kind =
is_test_ ? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto pooling_desc = mkldnn::pooling_forward::desc(
mkldnn_forward_prop_kind,
pooling_type_ == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src_md, dst_md, strides, ksize, padding_left_top,
padding_right_bottom, mkldnn::padding_kind::zero);
fwd_pd_.reset(
new mkldnn::pooling_forward::primitive_desc(pooling_desc, engine_));
dev_ctx_.SetBlob(key_pooling_pd, fwd_pd_);
}
}
return fwd_pd_;
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr,
"@dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
mkldnn::memory::primitive_desc workspace_mpd =
pooling_type_ == "max"
? fwd_pd_->workspace_primitive_desc()
: mkldnn::memory::primitive_desc(
{{}, dt_, mkldnn::memory::format::nchw}, engine_);
// Pooling PD has to be passed to Grad op that
// may be executed by different thread, hence
// for that one we use key that does not contain TID
auto local_key = key_common_ + "@workspace";
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
static std::mutex acquire_barrier;
std::lock_guard<std::mutex> block_threads_until_finish_this_job(
acquire_barrier);
mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(workspace_mpd);
dev_ctx_.SetBlob(local_key, mem_p);
}
}
return mem_p;
}
std::shared_ptr<mkldnn::pooling_forward> AcquirePooling(
std::shared_ptr<mkldnn::memory> dst_memory,
std::shared_ptr<mkldnn::memory> src_memory) {
auto prim_key = key_ + "@pooling_p";
auto pooling_p = std::static_pointer_cast<mkldnn::pooling_forward>(
dev_ctx_.GetBlob(prim_key));
if (pooling_p == nullptr) {
if (is_test_) {
pooling_p = std::make_shared<mkldnn::pooling_forward>(
*fwd_pd_, *(src_memory), *(dst_memory));
} else {
// For training we need to create workspace
// to store indices from backward
auto workspace_memory = this->AcquireWorkspaceMemory();
pooling_p = std::make_shared<mkldnn::pooling_forward>(
*fwd_pd_, *src_memory, *dst_memory, *workspace_memory);
}
dev_ctx_.SetBlob(prim_key, pooling_p);
}
return pooling_p;
}
std::shared_ptr<mkldnn::pooling_backward::primitive_desc>
AcquirePoolingBackwardPrimitiveDescriptor(
const mkldnn::memory::desc& diff_dst_md,
const mkldnn::memory::desc& diff_src_md, const std::vector<int>& ksize,
const std::vector<int>& strides, const std::vector<int>& paddings) {
const std::string key_pooling_pd = key_common_ + "@pooling_pd";
const std::string key_pooling_bwd_pd = key_ + "@pooling_bwd_pd";
bwd_pd_ =
std::static_pointer_cast<mkldnn::pooling_backward::primitive_desc>(
dev_ctx_.GetBlob(key_pooling_bwd_pd));
if (bwd_pd_ == nullptr) {
fwd_pd_ =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx_.GetBlob(key_pooling_pd));
// PD from FWD op has to exist.
PADDLE_ENFORCE(fwd_pd_ != nullptr, "Pooling MKL-DNN not found in cache!");
auto backward_desc = mkldnn::pooling_backward::desc(
pooling_type_ == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
mkldnn::padding_kind::zero);
bwd_pd_.reset(new mkldnn::pooling_backward::primitive_desc(
backward_desc, engine_, *fwd_pd_));
dev_ctx_.SetBlob(key_pooling_bwd_pd, bwd_pd_);
}
return bwd_pd_;
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = bwd_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(),
ptr, "@diff_src_mem_p");
}
std::shared_ptr<mkldnn::pooling_backward> AcquirePoolingBackward(
std::shared_ptr<mkldnn::memory> diff_dst_memory,
std::shared_ptr<mkldnn::memory> workspace,
std::shared_ptr<mkldnn::memory> diff_src_memory) {
auto prim_key = key_ + "@pooling_bwd_p";
auto pooling_bwd_p = std::static_pointer_cast<mkldnn::pooling_backward>(
dev_ctx_.GetBlob(prim_key));
if (pooling_bwd_p == nullptr) {
pooling_bwd_p = std::make_shared<mkldnn::pooling_backward>(
*bwd_pd_, *diff_dst_memory, *workspace, *diff_src_memory);
dev_ctx_.SetBlob(prim_key, pooling_bwd_p);
}
return pooling_bwd_p;
}
static std::string GetHash(
const memory::dims& input_dims, const std::string& pooling_type,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, const memory::data_type& dt,
const memory::format& fmt, const std::string& suffix) {
std::string key;
key.reserve(platform::MKLDNNHandler::MaxKeyLength);
platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
platform::MKLDNNHandler::AppendKey(&key, pooling_type);
platform::MKLDNNHandler::AppendKeyVec(&key, ksize);
platform::MKLDNNHandler::AppendKeyVec(&key, strides);
platform::MKLDNNHandler::AppendKeyVec(&key, paddings);
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt));
platform::MKLDNNHandler::AppendKey(&key, suffix);
return key;
}
private:
static inline int ComputeCeiledOutput(int input_size, int kernel_size,
int padding, int stride) {
return (input_size - kernel_size + 2 * padding) / stride + 1;
}
static inline void CorrectOutputSize(
const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
const std::vector<int>& kernel_size, const std::vector<int>& paddings,
const std::vector<int>& strides,
std::vector<int>& right_bot_padding) { // NOLINT
for (size_t i = 0; i < right_bot_padding.size(); i++) {
int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
paddings[i], strides[i]);
if (desired_size != dst_tz[i + 2]) {
right_bot_padding[i] += strides[i];
}
}
}
private:
mkldnn::memory::data_type dt_;
std::string pooling_type_;
bool is_test_;
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> fwd_pd_;
std::shared_ptr<mkldnn::pooling_backward::primitive_desc> bwd_pd_;
};
class TransposeMKLDNNHandler : public MKLDNNHandler {
 public:
  TransposeMKLDNNHandler(std::vector<int>& dims,  // NOLINT
......
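`AcquirePoolingPrimitiveDescriptor` and `AcquireWorkspaceMemory` store their blobs under `key_common_` (a key without the thread id) and guard creation with a static mutex plus a second lookup, so the descriptor built by the forward op on one thread can be reused by the backward op on another. A condensed sketch of that double-checked pattern, with a plain map standing in for the device-context cache:

```cpp
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>

struct PrimitiveDesc {};  // stand-in for mkldnn::pooling_forward::primitive_desc

// Illustrative thread-safe blob map; in Paddle this lives in the device context.
class BlobCache {
 public:
  std::shared_ptr<PrimitiveDesc> Get(const std::string& key) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }
  void Set(const std::string& key, std::shared_ptr<PrimitiveDesc> value) {
    std::lock_guard<std::mutex> guard(mutex_);
    blobs_[key] = std::move(value);
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::string, std::shared_ptr<PrimitiveDesc>> blobs_;
};

std::shared_ptr<PrimitiveDesc> AcquireFwdPd(BlobCache* dev_ctx,
                                            const std::string& key_common) {
  const std::string key_pooling_pd = key_common + "@pooling_pd";
  auto fwd_pd = dev_ctx->Get(key_pooling_pd);
  if (fwd_pd == nullptr) {
    // Serialize creation, then look again: only the first thread builds the
    // descriptor, every later thread picks up the cached one.
    static std::mutex acquire_barrier;
    std::lock_guard<std::mutex> block_threads(acquire_barrier);
    fwd_pd = dev_ctx->Get(key_pooling_pd);
    if (fwd_pd == nullptr) {
      fwd_pd = std::make_shared<PrimitiveDesc>();
      dev_ctx->Set(key_pooling_pd, fwd_pd);
    }
  }
  return fwd_pd;
}

int main() {
  BlobCache dev_ctx;
  auto a = AcquireFwdPd(&dev_ctx, "pool2d_0.tmp_0");
  auto b = AcquireFwdPd(&dev_ctx, "pool2d_0.tmp_0");
  return a == b ? 0 : 1;  // second acquire reuses the cached descriptor
}
```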
...@@ -543,7 +543,7 @@ function assert_api_spec_approvals() {
              python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
        if [ "${APPROVALS}" == "FALSE" ]; then
-           echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the api change! ${API_FILE} for the avoidance of the bad C++ code habits."
+           echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the usage (either add or delete) of const_cast."
            exit 1
        fi
    fi
...@@ -968,7 +968,7 @@ function build_document_preview() {
function example() {
-   pip install /paddle/build/python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl
+   pip install /paddle/build/python/dist/*.whl
    paddle version
    cd ${PADDLE_ROOT}/python/paddle/fluid
    python sampcd_processor.py
......
...@@ -212,10 +212,10 @@ if(WITH_DISTRIBUTE)
        py_test_modules(test_dgc_op MODULES test_dgc_op)
    endif()
    if(NOT APPLE)
-       set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 300)
-       set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 300)
-       set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 300)
-       set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 300)
+       set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350)
+       set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 350)
+       set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350)
+       set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350)
        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
        bash_test_modules(test_launch MODULES test_launch.sh)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import metrics
class TestAucSinglePredOp(OpTest):
def setUp(self):
self.op_type = "auc"
pred = np.random.random((128, 2)).astype("float32")
pred0 = pred[:, 0].reshape(128, 1)
labels = np.random.randint(0, 2, (128, 1)).astype("int64")
num_thresholds = 200
stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
self.inputs = {
'Predict': pred0,
'Label': labels,
"StatPos": stat_pos,
"StatNeg": stat_neg
}
self.attrs = {
'curve': 'ROC',
'num_thresholds': num_thresholds,
"slide_steps": 1
}
python_auc = metrics.Auc(name="auc",
curve='ROC',
num_thresholds=num_thresholds)
for i in range(128):
pred[i][1] = pred[i][0]
python_auc.update(pred, labels)
self.outputs = {
'AUC': np.array(python_auc.eval()),
'StatPosOut': np.array(python_auc._stat_pos),
'StatNegOut': np.array(python_auc._stat_neg)
}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()