diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 11052273d2849b4b8836c55466e205b8fd0789de..7daab6dac19768e1d35c84bfd78d319c8a62512b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -16,6 +16,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -124,7 +125,9 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     std::vector<FetchOpHandle *> *fetch_ops,
     std::vector<OpHandleBase *> *ready_fetch_ops) {
-  for (auto &fetch_var_name : fetch_tensors) {
+  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
+                                                   fetch_tensors.end());
+  for (auto &fetch_var_name : fetch_tensor_set) {
     for (auto &var_map : graph_->Get<GraphVars>(kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index ed9d7d991f830428f79a56a440cb9c9a5ad86509..db28e1fe202116f49e0266a7bc24ddfb351c8bb4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
-
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -157,7 +156,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     FeedFetchList *fetch_data) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::unordered_set<VarHandleBase *> local_ready_vars;
-  for (auto &fetch_var_name : fetch_tensors) {
+  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
+                                                   fetch_tensors.end());
+  for (auto &fetch_var_name : fetch_tensor_set) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
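Both executors now deduplicate `fetch_tensors` before building fetch ops, so a fetch list such as `[loss, loss]` no longer creates two fetch ops racing on the same variable. A minimal standalone sketch of the idiom (illustrative only, not Paddle code):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  // Duplicated fetch names, e.g. fetch_list=['loss', 'loss'] on the Python side.
  std::vector<std::string> fetch_tensors{"loss", "acc", "loss"};
  // Collapsing them first means each variable gets exactly one fetch op.
  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
                                                   fetch_tensors.end());
  for (const auto &name : fetch_tensor_set) std::cout << name << "\n";
  return 0;  // prints "loss" and "acc" once each (order unspecified)
}
```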
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
index 001d26936886f12efc6eaa0333bb12e4e7118d67..e0eebad08bb6b9a15d9c0f356215404884bee0e9 100644
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ b/paddle/fluid/operators/metrics/auc_op.cc
@@ -28,8 +28,9 @@ class AucOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Label"),
                    "Input of Label should not be null.");
     auto predict_width = ctx->GetInputDim("Predict")[1];
-    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_width, 2,
-                                 "Only support binary classification");
+    PADDLE_INFERSHAPE_ENFORCE_LE(ctx, predict_width, 2,
+                                 "Only support binary classification, "
+                                 "prediction dims[1] should be 1 or 2");
     auto predict_height = ctx->GetInputDim("Predict")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h
index 4ab5cfe53c67eeaa995d7e955eec63a065c5eec5..6fb4749b35a37dfbb18d322920b2744d7a0882d4 100644
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ b/paddle/fluid/operators/metrics/auc_op.h
@@ -75,7 +75,10 @@ class AucKernel : public framework::OpKernel<T> {
     const auto *label_data = label->data<int64_t>();
 
     for (size_t i = 0; i < batch_size; i++) {
-      auto predict_data = inference_data[i * inference_width + 1];
+      // if Predict has a width of 2, predict_data[i][1] is the pos prob
+      // if Predict has a width of 1, predict_data[i][0] is the pos prob
+      auto predict_data =
+          inference_data[i * inference_width + (inference_width - 1)];
       PADDLE_ENFORCE_LE(predict_data, 1,
                         "The predict data must less or equal 1.");
       PADDLE_ENFORCE_GE(predict_data, 0,
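With `PADDLE_INFERSHAPE_ENFORCE_LE`, `Predict` may now have one column (the positive-class probability alone) or two (negative, positive), and the kernel always reads the last column. A self-contained sketch of that indexing, with hypothetical data:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Returns the positive-class probability for row i of a row-major
// (batch_size x width) prediction matrix, where width is 1 or 2.
float PositiveProb(const std::vector<float>& data, size_t width, size_t i) {
  assert(width == 1 || width == 2);
  // width == 2: column 1 holds P(positive); width == 1: column 0 does.
  return data[i * width + (width - 1)];
}

int main() {
  std::vector<float> two_col{0.3f, 0.7f, 0.9f, 0.1f};  // rows (0.3,0.7), (0.9,0.1)
  std::vector<float> one_col{0.7f, 0.1f};               // same positives, one column
  assert(PositiveProb(two_col, 2, 0) == PositiveProb(one_col, 1, 0));
  assert(PositiveProb(two_col, 2, 1) == PositiveProb(one_col, 1, 1));
  return 0;
}
```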
"@pool_workspace_memory"; - - std::shared_ptr src_memory, dst_memory; - std::shared_ptr pool_pd; - std::shared_ptr pool_src_memory_p, pool_dst_memory_p; - - auto pool_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); - if (pool_p == nullptr) { - const std::vector& padding_left_top(paddings); - std::vector padding_right_bottom(paddings); - bool ceil_mode = ctx.Attr("ceil_mode"); - if (ceil_mode) { - CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, - padding_right_bottom); - } - auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); - - /* create memory descriptor for pooling without specified format - * ('any') which lets a primitive (pooling in this case) choose - * the memory format preferred for best performance - */ - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); - auto propagation = src_md.data.data_type == mkldnn_f32 - ? mkldnn::prop_kind::forward_training - : mkldnn::prop_kind::forward_scoring; - std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, propagation, strides, - padding_left_top, padding_right_bottom, ksize, - pooling_type, mkldnn_engine, ceil_mode, is_test); - - // save pool_pd into global device context to be referred in backward path - if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); - - src_memory = std::make_shared(pool_pd->src_primitive_desc(), - to_void_cast(input_data)); - dst_memory = - std::make_shared(pool_pd->dst_primitive_desc(), output_data); - - dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); - dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); - - if (is_test) { - pool_p = std::make_shared(*pool_pd, *src_memory, - *dst_memory); - } else { - std::shared_ptr workspace_memory = - CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); - - // save pool_workspace_memory to be referred in backward path - dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - - pool_p = std::make_shared( - *pool_pd, *src_memory, *dst_memory, *workspace_memory); - } + const std::string key = platform::PoolingMKLDNNHandler::GetHash( + src_tz, pooling_type, ksize, strides, paddings, dt, fmt, + ctx.op().Output("Out")); - dev_ctx.SetBlob(key_pool_p, pool_p); - - output_format = - (memory::format)dst_memory->get_primitive_desc().desc().data.format; - } else { - // Primitives already exist - pool_src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); - PADDLE_ENFORCE(pool_src_memory_p != nullptr, - "Fail to find pooling src mem_p in device context"); - pool_dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); - PADDLE_ENFORCE(pool_dst_memory_p != nullptr, - "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle(to_void_cast(input_data)); - pool_dst_memory_p->set_data_handle(output_data); - - output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() - .desc() - .data.format; - } + platform::PoolingMKLDNNHandler handler(pooling_type, dt, + ctx.Attr("is_test"), dev_ctx, + mkldnn_engine, key); + + auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); + + auto src_memory = + handler.AcquireSrcMemory(src_md, to_void_cast(input_data)); + + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); + + auto pooling_pd = handler.AcquirePoolingPrimitiveDescriptor( + src_tz, dst_tz, 
src_md, dst_md, ksize, strides, paddings, + ctx.Attr("ceil_mode")); + + auto dst_memory = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + + auto pool_p = handler.AcquirePooling(dst_memory, src_memory); // push primitive to stream and wait until it's executed std::vector pipeline{*pool_p}; stream(stream::kind::eager).submit(pipeline).wait(); + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; + output->set_layout(DataLayout::kMKLDNN); output->set_format(output_format); } - - private: - std::unique_ptr CreatePrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const mkldnn::prop_kind& propagation, const std::vector& stride, - const std::vector& padding_left_top, - const std::vector& padding_right_bot, const std::vector& kernel, - const std::string& pooling_type, const mkldnn::engine& engine, - bool ceil_mode, bool is_test) const { - auto mkldnn_forward_prop_kind = is_test - ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; - auto pool_desc = mkldnn::pooling_forward::desc( - mkldnn_forward_prop_kind, - pooling_type == "max" ? mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - src, dst, stride, kernel, padding_left_top, padding_right_bot, - mkldnn::padding_kind::zero); - - auto p_pool_pd = - new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); - return std::unique_ptr(p_pool_pd); - } - - std::unique_ptr CreateWorkspaceMemory( - std::shared_ptr pool_pd, - const std::string& pooling_type, const mkldnn::engine& engine) const { - mkldnn::memory::primitive_desc workspace_md = - pooling_type == "max" - ? pool_pd->workspace_primitive_desc() - : mkldnn::memory::primitive_desc({{}, - platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw}, - engine); - - auto p_workspace_memory = new mkldnn::memory(workspace_md); - return std::unique_ptr(p_workspace_memory); - } }; template @@ -299,6 +157,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { ctx.template device_context(); const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine(); + std::vector pipeline; + const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); memory::format in_x_grad_format{memory::format::format_undef}; @@ -310,119 +170,41 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Out" variable // This name will be used as key when referring info from device context - const std::string key = CreateKey(ctx, diff_src_tz, pooling_type, ksize, - strides, paddings, memory::data_type::f32, - in_x->format(), ctx.op().Input("Out")); - const std::string key_pool_bwd_p = key + "@pool_bwd_p"; - const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; - const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; - const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; - const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; - - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, - mkldnn_engine}, - to_void_cast(out_grad_data)); - - std::shared_ptr diff_src_memory; - std::shared_ptr diff_dst_memory; - auto dst_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); - PADDLE_ENFORCE(dst_memory != nullptr, - "Fail to find 
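The ceil-mode correction that moved into `PoolingMKLDNNHandler` reconciles MKL-DNN's floor-style output formula with a ceil-mode output shape by growing only the right/bottom padding. A worked sketch of the arithmetic (plain integers, no MKL-DNN API):

```cpp
#include <cassert>

// Same formula as the (now handler-private) ComputeCeiledOutput:
// output length of one spatial dim with left-top/right-bottom padding.
int ComputeOutput(int input, int kernel, int pad_lt, int pad_rb, int stride) {
  return (input - kernel + pad_lt + pad_rb) / stride + 1;
}

int main() {
  // input=6, kernel=3, stride=2, pad=0:
  // floor mode yields (6-3+0)/2+1 = 2, but ceil_mode promises 3.
  assert(ComputeOutput(6, 3, 0, 0, 2) == 2);
  // CorrectOutputSize bridges the gap by growing only the
  // right/bottom padding by one stride: (6-3+0+2)/2+1 = 3.
  assert(ComputeOutput(6, 3, 0, 2, 2) == 3);
  return 0;
}
```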
dst_memory in device context"); - - primitive reorder_diff_dst; - bool is_diff_dst_reordered = false; - auto pool_bwd_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_bwd_p)); - if (pool_bwd_p == nullptr) { - // Retrieve src_memory/dst_memory saved in forward pass - auto src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); - PADDLE_ENFORCE(src_memory != nullptr, - "Fail to find src_memory in device context"); - // Retrieve pool_pd/pool_workspace_memory from device context - auto pool_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_pd)); - PADDLE_ENFORCE(pool_pd != nullptr, - "Fail to find pool_pd in device context"); - auto workspace_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_workspace_memory)); - PADDLE_ENFORCE(workspace_memory != nullptr, - "Fail to find workspace_memory in device context"); - - // create memory descriptors for pooling - auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); - auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); - - auto pool_bwd_desc = mkldnn::pooling_backward::desc( - pooling_type == "max" ? mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, - mkldnn::padding_kind::zero); - auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( - pool_bwd_desc, mkldnn_engine, *pool_pd); - - // reorder between user_diff_dst and pool diff_dst if needed - diff_dst_memory = std::make_shared(user_diff_dst_memory); - if (memory::primitive_desc(dst_memory->get_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory = - std::make_shared(dst_memory.get()->get_primitive_desc()); - reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); - is_diff_dst_reordered = true; - } + const std::string key = platform::PoolingMKLDNNHandler::GetHash( + diff_src_tz, pooling_type, ksize, strides, paddings, + memory::data_type::f32, in_x->format(), ctx.op().Input("Out")); - diff_src_memory = std::make_shared( - pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); - - dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); - - pool_bwd_p = std::make_shared( - pool_bwd_pd, *diff_dst_memory, *workspace_memory, *diff_src_memory); - dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); - - } else { - // Primitives already exist - diff_src_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(diff_src_memory != nullptr, - "Fail to find pooling src mem_p in device context"); - diff_dst_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(diff_dst_memory != nullptr, - "Fail to find pooling dst mem_p in device context"); - - diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); - diff_dst_memory->set_data_handle(const_cast(out_grad_data)); - - // reorder between user_diff_dst and pool diff_dst if needed - if (memory::primitive_desc(dst_memory->get_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory = - std::make_shared(dst_memory.get()->get_primitive_desc()); - reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); - is_diff_dst_reordered = true; - } - } + platform::PoolingMKLDNNHandler handler( + pooling_type, paddle::framework::ToMKLDNNDataType(in_x_grad->type()), + false, dev_ctx, mkldnn_engine, key); - in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() - .desc() - 
.data.format; + auto workspace = handler.AcquireWorkspaceMemory(); + + auto diff_dst_md = platform::MKLDNNMemDesc( + {diff_dst_tz}, platform::MKLDNNGetDataType(), out_grad->format()); + + auto diff_dst_memory = handler.AcquireDiffDstMemory( + diff_dst_md, to_void_cast(out_grad_data)); + + auto diff_src_md = + platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::any); + + auto bwd_pd = handler.AcquirePoolingBackwardPrimitiveDescriptor( + diff_dst_md, diff_src_md, ksize, strides, paddings); + + auto diff_src_memory = handler.AcquireDiffSrcMemoryFromPrimitive( + reinterpret_cast(in_x_grad_data)); + + auto pool_bwd_p = handler.AcquirePoolingBackward(diff_dst_memory, workspace, + diff_src_memory); - // push primitive to stream and wait until it's executed - std::vector pipeline; - if (is_diff_dst_reordered) { - pipeline.push_back(reorder_diff_dst); - } pipeline.push_back(*pool_bwd_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; in_x_grad->set_layout(DataLayout::kMKLDNN); in_x_grad->set_format(in_x_grad_format); } // Compute() diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index f5a1b32e5c240933d79a524937b5a8222118fdd9..4eb5b7ad9d1fe128ade904cf61e0178d59b374b8 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -58,10 +58,14 @@ class ScatterGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); + if (ctx->HasOutput(framework::GradVarName("Updates"))) { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } } protected: diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index e9ad347538157342adb24813546e927040b4f9d2..e17617b40da356d74bdffcf53a6c9189d13c64f1 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -47,12 +47,15 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - - // In place gradient: dX = dO - dX->ShareDataWith(*dOut); - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (dX) { + // In place gradient: dX = dO + framework::TensorCopy(*dOut, ctx.GetPlace(), dX); + } + if (dUpdates) { + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates = dO[Ids] + GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + } } }; diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 9c237dc0f1f115ce76a3b982a8c6ca1dfccb0b87..3b6184de77f4fc05aa2f2900ebc656ed06a8edfc 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -74,11 +74,15 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto 
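All three scatter-grad paths now tolerate absent gradient outputs, which happens when `X` or `Updates` does not require gradients and the backward graph prunes the corresponding variable. A stand-in sketch of the guard (the `MaybeGradOutput` helper is hypothetical, mimicking `ctx.Output<Tensor>(...)` returning null for a pruned output):

```cpp
#include <iostream>

struct Tensor { /* stand-in for framework::Tensor */ };

// Hypothetical stand-in: returns nullptr when the gradient variable
// was pruned (e.g. the input had stop_gradient=True).
Tensor* MaybeGradOutput(bool required) {
  static Tensor t;
  return required ? &t : nullptr;
}

int main() {
  Tensor* dX = MaybeGradOutput(false);       // X does not need a gradient
  Tensor* dUpdates = MaybeGradOutput(true);  // Updates does
  // The guarded pattern from the patch: only fill gradients that exist.
  if (dX) std::cout << "fill dX\n";
  if (dUpdates) std::cout << "fill dUpdates\n";
  return 0;
}
```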
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 445eb4a599a6566df3272f065ed188dd2316e227..86e4379c4fbfd01a1cf33aabfd838ede1e60464c 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -108,17 +108,11 @@ int GetCUDADeviceCount() {
 int GetCUDAComputeCapability(int id) {
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   cudaDeviceProp device_prop;
-  auto e = cudaGetDeviceProperties(&device_prop, id);
-  std::ostringstream ostr;
-  ostr << "cudaGetDeviceProperties failed in"
-          "paddle::platform::GetCUDAComputeCapability!"
-          "Error Type ID = "
-       << e << " Please see detail in:"
-               "https://docs.nvidia.com/cuda/cuda-runtime-api/"
-               "group__CUDART__TYPES.html#group__CUDART__TYPES_"
-               "1g3f51e3575c2178246db0a94a430e0038";
-  std::string ErrorLog = ostr.str();
-  PADDLE_ENFORCE(e, ErrorLog);
+  auto error_code = cudaGetDeviceProperties(&device_prop, id);
+  PADDLE_ENFORCE(error_code,
+                 "cudaGetDeviceProperties failed in "
+                 "paddle::platform::GetCUDAComputeCapability, error code: %d",
+                 error_code);
   return device_prop.major * 10 + device_prop.minor;
 }
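Note that the rewritten check relies on `PADDLE_ENFORCE`'s `cudaError_t` overload, which treats `cudaSuccess` as a pass. A standalone equivalent against the raw CUDA runtime API, with the success check written out explicitly (assumes a CUDA toolchain):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaError_t err = cudaGetDeviceProperties(&prop, /*device=*/0);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaGetDeviceProperties failed: %s\n",
                 cudaGetErrorString(err));
    return 1;
  }
  // Same packing as GetCUDAComputeCapability: e.g. capability 7.0 -> 70.
  int capability = prop.major * 10 + prop.minor;
  std::printf("compute capability: %d\n", capability);
  return 0;
}
```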
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index d478d66fc5617bed9d67d53b436fa8c1456537bb..70f8d9dbd8d623aac53b37d0dd5dc980cee8bfb1 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -122,6 +122,18 @@ class MKLDNNHandler {
     return mem_p;
   }
 
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      const mkldnn::memory::primitive_desc& mpd, const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mpd);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    }
+    return mem_p;
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireMemory(
       const std::shared_ptr<mkldnn::memory>& user_memory_p,
       const std::shared_ptr<mkldnn::memory>& target_memory_p,
@@ -424,6 +436,223 @@ class ActivationMKLDNNHandler : public MKLDNNHandler {
   std::shared_ptr<mkldnn::eltwise_backward::primitive_desc> activation_bwd_pd_;
 };
 
+class PoolingMKLDNNHandler : public MKLDNNHandler {
+ public:
+  PoolingMKLDNNHandler(const std::string& pooling_type,
+                       mkldnn::memory::data_type dt, bool is_test,
+                       const platform::MKLDNNDeviceContext& dev_ctx,
+                       mkldnn::engine engine, const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        dt_(dt),
+        pooling_type_(pooling_type),
+        is_test_(is_test) {}
+
+  std::shared_ptr<mkldnn::pooling_forward::primitive_desc>
+  AcquirePoolingPrimitiveDescriptor(
+      const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
+      const mkldnn::memory::desc& src_md, const mkldnn::memory::desc& dst_md,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, bool ceil_mode) {
+    // Pooling PD has to be passed to Grad op that
+    // may be executed by different thread, hence
+    // for that one we use key that does not contain TID
+    const std::string key_pooling_pd = key_common_ + "@pooling_pd";
+    fwd_pd_ =
+        std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
+            dev_ctx_.GetBlob(key_pooling_pd));
+    if (fwd_pd_ == nullptr) {
+      static std::mutex acquire_barrier;
+      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
+          acquire_barrier);
+      fwd_pd_ =
+          std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
+              dev_ctx_.GetBlob(key_pooling_pd));
+      if (fwd_pd_ == nullptr) {
+        std::vector<int> padding_left_top(paddings);
+        std::vector<int> padding_right_bottom(paddings);
+        if (ceil_mode) {
+          CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
+                            padding_right_bottom);
+        }
+        auto mkldnn_forward_prop_kind =
+            is_test_ ? mkldnn::prop_kind::forward_inference
+                     : mkldnn::prop_kind::forward_training;
+        auto pooling_desc = mkldnn::pooling_forward::desc(
+            mkldnn_forward_prop_kind,
+            pooling_type_ == "max" ? mkldnn::algorithm::pooling_max
+                                   : mkldnn::algorithm::pooling_avg,
+            src_md, dst_md, strides, ksize, padding_left_top,
+            padding_right_bottom, mkldnn::padding_kind::zero);
+
+        fwd_pd_.reset(
+            new mkldnn::pooling_forward::primitive_desc(pooling_desc, engine_));
+        dev_ctx_.SetBlob(key_pooling_pd, fwd_pd_);
+      }
+    }
+    return fwd_pd_;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
+    mkldnn::memory::primitive_desc workspace_mpd =
+        pooling_type_ == "max"
+            ? fwd_pd_->workspace_primitive_desc()
+            : mkldnn::memory::primitive_desc(
+                  {{}, dt_, mkldnn::memory::format::nchw}, engine_);
+    // Pooling PD has to be passed to Grad op that
+    // may be executed by different thread, hence
+    // for that one we use key that does not contain TID
+    auto local_key = key_common_ + "@workspace";
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      static std::mutex acquire_barrier;
+      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
+          acquire_barrier);
+      mem_p =
+          std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+      if (mem_p == nullptr) {
+        mem_p = std::make_shared<mkldnn::memory>(workspace_mpd);
+        dev_ctx_.SetBlob(local_key, mem_p);
+      }
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::pooling_forward> AcquirePooling(
+      std::shared_ptr<mkldnn::memory> dst_memory,
+      std::shared_ptr<mkldnn::memory> src_memory) {
+    auto prim_key = key_ + "@pooling_p";
+
+    auto pooling_p = std::static_pointer_cast<mkldnn::pooling_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    if (pooling_p == nullptr) {
+      if (is_test_) {
+        pooling_p = std::make_shared<mkldnn::pooling_forward>(
+            *fwd_pd_, *(src_memory), *(dst_memory));
+      } else {
+        // For training we need to create workspace
+        // to store indices from backward
+        auto workspace_memory = this->AcquireWorkspaceMemory();
+
+        pooling_p = std::make_shared<mkldnn::pooling_forward>(
+            *fwd_pd_, *src_memory, *dst_memory, *workspace_memory);
+      }
+      dev_ctx_.SetBlob(prim_key, pooling_p);
+    }
+    return pooling_p;
+  }
+
+  std::shared_ptr<mkldnn::pooling_backward::primitive_desc>
+  AcquirePoolingBackwardPrimitiveDescriptor(
+      const mkldnn::memory::desc& diff_dst_md,
+      const mkldnn::memory::desc& diff_src_md, const std::vector<int>& ksize,
+      const std::vector<int>& strides, const std::vector<int>& paddings) {
+    const std::string key_pooling_pd = key_common_ + "@pooling_pd";
+    const std::string key_pooling_bwd_pd = key_ + "@pooling_bwd_pd";
+    bwd_pd_ =
+        std::static_pointer_cast<mkldnn::pooling_backward::primitive_desc>(
+            dev_ctx_.GetBlob(key_pooling_bwd_pd));
+    if (bwd_pd_ == nullptr) {
+      fwd_pd_ =
+          std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
+              dev_ctx_.GetBlob(key_pooling_pd));
+      // PD from FWD op has to exist.
+      PADDLE_ENFORCE(fwd_pd_ != nullptr, "Pooling MKL-DNN not found in cache!");
+
+      auto backward_desc = mkldnn::pooling_backward::desc(
+          pooling_type_ == "max" ? mkldnn::algorithm::pooling_max
+                                 : mkldnn::algorithm::pooling_avg,
+          diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
+          mkldnn::padding_kind::zero);
+      bwd_pd_.reset(new mkldnn::pooling_backward::primitive_desc(
+          backward_desc, engine_, *fwd_pd_));
+
+      dev_ctx_.SetBlob(key_pooling_bwd_pd, bwd_pd_);
+    }
+    return bwd_pd_;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = bwd_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(),
+                                            ptr, "@diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::pooling_backward> AcquirePoolingBackward(
+      std::shared_ptr<mkldnn::memory> diff_dst_memory,
+      std::shared_ptr<mkldnn::memory> workspace,
+      std::shared_ptr<mkldnn::memory> diff_src_memory) {
+    auto prim_key = key_ + "@pooling_bwd_p";
+
+    auto pooling_bwd_p = std::static_pointer_cast<mkldnn::pooling_backward>(
+        dev_ctx_.GetBlob(prim_key));
+    if (pooling_bwd_p == nullptr) {
+      pooling_bwd_p = std::make_shared<mkldnn::pooling_backward>(
+          *bwd_pd_, *diff_dst_memory, *workspace, *diff_src_memory);
+      dev_ctx_.SetBlob(prim_key, pooling_bwd_p);
+    }
+
+    return pooling_bwd_p;
+  }
+
+  static std::string GetHash(
+      const memory::dims& input_dims, const std::string& pooling_type,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const memory::data_type& dt,
+      const memory::format& fmt, const std::string& suffix) {
+    std::string key;
+    key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+    platform::MKLDNNHandler::AppendKeyDims(&key, input_dims);
+    platform::MKLDNNHandler::AppendKey(&key, pooling_type);
+    platform::MKLDNNHandler::AppendKeyVec(&key, ksize);
+    platform::MKLDNNHandler::AppendKeyVec(&key, strides);
+    platform::MKLDNNHandler::AppendKeyVec(&key, paddings);
+    platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+    platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt));
+    platform::MKLDNNHandler::AppendKey(&key, suffix);
+    return key;
+  }
+
+ private:
+  static inline int ComputeCeiledOutput(int input_size, int kernel_size,
+                                        int padding, int stride) {
+    return (input_size - kernel_size + 2 * padding) / stride + 1;
+  }
+
+  static inline void CorrectOutputSize(
+      const std::vector<int>& src_tz, const std::vector<int>& dst_tz,
+      const std::vector<int>& kernel_size, const std::vector<int>& paddings,
+      const std::vector<int>& strides,
+      std::vector<int>& right_bot_padding) {  // NOLINT
+    for (size_t i = 0; i < right_bot_padding.size(); i++) {
+      int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i],
+                                             paddings[i], strides[i]);
+      if (desired_size != dst_tz[i + 2]) {
+        right_bot_padding[i] += strides[i];
+      }
+    }
+  }
+
+ private:
+  mkldnn::memory::data_type dt_;
+  std::string pooling_type_;
+  bool is_test_;
+  std::shared_ptr<mkldnn::pooling_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::pooling_backward::primitive_desc> bwd_pd_;
+};
+
 class TransposeMKLDNNHandler : public MKLDNNHandler {
  public:
   TransposeMKLDNNHandler(std::vector<int>& dims,  // NOLINT
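`AcquirePoolingPrimitiveDescriptor` and `AcquireWorkspaceMemory` share state with the grad op under a TID-free key (`key_common_`), so creation is serialized with a double-checked lock. The pattern reduced to a compilable sketch (the `int` payload and cache are stand-ins for the real primitive descriptor and the `MKLDNNDeviceContext` blob map):

```cpp
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Minimal thread-safe blob cache, standing in for MKLDNNDeviceContext.
std::unordered_map<std::string, std::shared_ptr<int>> cache;
std::mutex cache_mutex;

std::shared_ptr<int> GetBlob(const std::string& key) {
  std::lock_guard<std::mutex> g(cache_mutex);
  auto it = cache.find(key);
  return it == cache.end() ? nullptr : it->second;
}

void SetBlob(const std::string& key, std::shared_ptr<int> value) {
  std::lock_guard<std::mutex> g(cache_mutex);
  cache[key] = value;
}

// The double-checked acquire used by AcquirePoolingPrimitiveDescriptor:
// cheap lookup first, then re-check under a barrier before creating.
std::shared_ptr<int> Acquire(const std::string& key) {
  auto pd = GetBlob(key);
  if (pd == nullptr) {
    static std::mutex acquire_barrier;
    std::lock_guard<std::mutex> block_threads(acquire_barrier);
    pd = GetBlob(key);
    if (pd == nullptr) {
      pd = std::make_shared<int>(42);  // expensive construction goes here
      SetBlob(key, pd);
    }
  }
  return pd;
}

int main() { return Acquire("@pooling_pd") ? 0 : 1; }
```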
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index fadd9c280b0056849030dcbc26d289a5f92d91c9..1b415df1aa89b3d1b7524153037b0b81d6fa0065 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -543,7 +543,7 @@ function assert_api_spec_approvals() {
               python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641`
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the api change! ${API_FILE} for the avoidance of the bad C++ code habits."
+            echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang) approval for the usage (either add or delete) of const_cast."
             exit 1
         fi
     fi
@@ -968,7 +968,7 @@ function build_document_preview() {
 
 function example() {
-    pip install /paddle/build/python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl
+    pip install /paddle/build/python/dist/*.whl
     paddle version
     cd ${PADDLE_ROOT}/python/paddle/fluid
     python sampcd_processor.py
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ecd40dc6edeb3ae456a515799bd721c9812b3d7b..503ff20b844db83f9af6f1b104612cb7654c4290 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -212,10 +212,10 @@ if(WITH_DISTRIBUTE)
         py_test_modules(test_dgc_op MODULES test_dgc_op)
     endif()
     if(NOT APPLE)
-        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 300)
-        set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 300)
-        set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 300)
-        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 300)
+        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 350)
+        set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 350)
+        set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 350)
+        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 350)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
         bash_test_modules(test_launch MODULES test_launch.sh)
diff --git a/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py b/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e93fa57b081fa1ce0ec6309ee166335b05ec9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from paddle.fluid import metrics
+
+
+class TestAucSinglePredOp(OpTest):
+    def setUp(self):
+        self.op_type = "auc"
+        pred = np.random.random((128, 2)).astype("float32")
+        pred0 = pred[:, 0].reshape(128, 1)
+        labels = np.random.randint(0, 2, (128, 1)).astype("int64")
+        num_thresholds = 200
+
+        stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
+        stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
+
+        self.inputs = {
+            'Predict': pred0,
+            'Label': labels,
+            "StatPos": stat_pos,
+            "StatNeg": stat_neg
+        }
+        self.attrs = {
+            'curve': 'ROC',
+            'num_thresholds': num_thresholds,
+            "slide_steps": 1
+        }
+
+        python_auc = metrics.Auc(name="auc",
+                                 curve='ROC',
+                                 num_thresholds=num_thresholds)
+        for i in range(128):
+            pred[i][1] = pred[i][0]
+        python_auc.update(pred, labels)
+
+        self.outputs = {
+            'AUC': np.array(python_auc.eval()),
+            'StatPosOut': np.array(python_auc._stat_pos),
+            'StatNegOut': np.array(python_auc._stat_neg)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()