Unverified commit 287ca7d5 authored by Zeng Jinle, committed by GitHub

MLPerf Optimization for Release/2.2 (#37109)

* add mlperf optimization PRs

* update
Parent 70cb0a54
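This PR threads CUDA Graph support (plus the resnet_unit fused op and several scheduler tweaks) through ParallelExecutor, the allocators, and the fused cuDNN ops below. As background for the diffs that follow, here is a minimal capture-and-replay sketch using only the public CUDA runtime API; it assumes a CUDA 10/11-style cudaGraphInstantiate signature and is not Paddle's wrapper code. The device pointers recorded during capture must stay valid for every replay, which is why the executor changes below mark variables persistable and keep a dedicated memory pool alive per captured graph.

// cuda_graph_basics.cu -- minimal capture/replay sketch (CUDA runtime API only).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void Scale(float *x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 1 << 20;
  float *d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. Record the per-iteration work once instead of re-launching it each step.
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  Scale<<<(n + 255) / 256, 256, 0, stream>>>(d, 0.5f, n);  // the address of d is baked into the graph
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);  // CUDA 10/11 signature

  // 2. Replay: a single launch per iteration, no per-op CPU launch overhead.
  for (int iter = 0; iter < 10; ++iter) cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(d);
  return 0;
}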
......@@ -218,7 +218,7 @@ function(op_library TARGET)
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op" "fused_attention_op" "fused_feedforward_op")
"fused_bn_add_activation_op" "fused_attention_op" "fused_feedforward_op" "resnet_unit_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
......@@ -143,6 +143,8 @@ struct BuildStrategy {
// Turn off inplace addto by default.
bool enable_addto_{false};
bool allow_cuda_graph_capture_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy don't tell whether
// it's a distributed model.
......
......@@ -130,10 +130,12 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
}
}
// Wait FetchOps.
ClearFetchOp(graph_, &fetch_ops);
if (!fetch_ops.empty()) {
ClearFetchOp(graph_, &fetch_ops);
for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
}
}
return fetches;
......
......@@ -86,19 +86,28 @@ struct ScaleLossGradFunctor {
}
};
std::string ScaleLossGradOpHandle::LossGradName() const {
return static_cast<VarHandle *>(this->outputs_[0])->name();
}
void ScaleLossGradOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
// Doesn't wait for any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true);
}
auto *tensor =
local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) {
auto *tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
if (record_event) {
this->RunAndRecordEvent(
[&] { framework::VisitDataType(out_dtype_, func); });
} else {
framework::VisitDataType(out_dtype_, func);
}
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
......
......@@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
std::string Name() const override;
platform::Place GetPlace() const { return place_; }
void RunOnVar(Variable *var, bool record_event = false);
std::string LossGradName() const;
protected:
void RunImpl() override;
......
......@@ -22,7 +22,9 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
......@@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
PrepareLocalExeScopes();
}
static void RunProgramDescs(const ProgramDescs &programs,
const std::vector<Scope *> &local_exec_scopes,
const std::vector<platform::Place> &places) {
for (auto &program : programs) {
for (auto &op_desc : program.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes[i], places[i]);
}
}
}
}
FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
DropLocalExeScopes(/*need_wait=*/false);
}
#endif
if (drop_scope_counter_ == 0) {
platform::RecordEvent e("InitLocalVars");
InitVariables();
......@@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run(
++drop_scope_counter_;
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ ||
DropScopeOrNot()) {
DropLocalExeScopes();
DropLocalExeScopes(!platform::IsCUDAGraphCapturing());
}
if (VLOG_IS_ON(5)) {
......@@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kStartupProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kStartupProgramDescs);
for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
is_initialized_ = true;
}
......@@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);
for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
}
void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) {
platform::RecordEvent drop_scope_event("DropLocalExeScopes");
drop_scope_counter_ = 0;
for (auto &p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
if (need_wait) {
for (auto &p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
scope_monitor_.ClearHistoryLocalExecScopes();
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
......
......@@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FetchResultType Run(const std::vector<std::string>& fetch_tensors,
bool return_merged) override;
void DropLocalExeScopes();
void DropLocalExeScopes(bool need_wait = true);
bool NeedCreateLocalExeScope();
......
......@@ -115,6 +115,7 @@ message BuildStrategy {
optional bool enable_auto_fusion = 11 [ default = false ];
optional bool enable_addto = 12 [ default = false ];
optional bool fix_op_run_order = 13 [ default = false ];
optional bool allow_cuda_graph_capture = 14 [ default = false ];
}
message ExecutionStrategy {
......
......@@ -179,7 +179,8 @@ void InplaceAddToOpPass::Run(Graph *graph) const {
out_var_ptr->GeneratedOp());
// NOTE(zhiqiu): currently, only conv2d_grad and resnet_unit_grad support addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
if (right_generated_op->Name() != "conv2d_grad" &&
right_generated_op->Name() != "resnet_unit_grad") {
continue;
}
......@@ -224,11 +225,13 @@ static bool IsValidConv2DGradDataGradNode(const Node &node) {
if (node.inputs.empty()) return false;
auto *generated_op = node.inputs[0];
auto *op_desc = generated_op->Op();
if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") {
if (op_desc == nullptr || (op_desc->Type() != "conv2d_grad" &&
op_desc->Type() != "resnet_unit_grad")) {
return false;
}
const auto &outputs = op_desc->Outputs();
auto iter = outputs.find(GradVarName("Input"));
std::string grad_var_name = op_desc->Type() == "conv2d_grad" ? "Input" : "X";
auto iter = outputs.find(GradVarName(grad_var_name));
return iter != outputs.end() && !iter->second.empty() &&
iter->second[0] == node.Name() &&
!op_desc->GetAttrIfExists<bool>("use_addto");
......
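The change above extends the inplace addto pass to resnet_unit_grad, whose input-gradient output is keyed by "X" rather than "Input". "Addto" means the newly computed gradient is accumulated into an existing buffer instead of being written out and then summed by a separate grad_add; the cuDNN backward-data call later in this PR expresses this with the usual alpha/beta convention (beta = 1 when use_addto is set). A minimal CPU sketch of that semantics, with made-up names:

#include <cstdio>
#include <vector>

// Reference semantics of y = alpha * new_grad + beta * y.
// With beta == 1 (use_addto), the new gradient is accumulated in place,
// so no extra buffer and no elementwise sum op are needed.
void AxpbyInPlace(const std::vector<float>& new_grad, std::vector<float>* y,
                  float alpha, float beta) {
  for (size_t i = 0; i < y->size(); ++i) {
    (*y)[i] = alpha * new_grad[i] + beta * (*y)[i];
  }
}

int main() {
  std::vector<float> grad_from_branch = {1.f, 2.f, 3.f};
  std::vector<float> input_grad = {10.f, 10.f, 10.f};  // already holds another branch's gradient
  const bool use_addto = true;
  AxpbyInPlace(grad_from_branch, &input_grad, 1.f, use_addto ? 1.f : 0.f);
  std::printf("%g %g %g\n", input_grad[0], input_grad[1], input_grad[2]);  // 11 12 13
  return 0;
}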
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
......@@ -21,14 +22,23 @@ namespace paddle {
namespace framework {
namespace ir {
template <typename T>
static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base,
const platform::Place &place) {
auto *op = dynamic_cast<T *>(op_base);
return op && op->GetPlace() == place;
}
static bool IsLockAndRecordEventFreeComputationOpHandle(
details::ComputationOpHandle *op, const OpGraphView &graph_view) {
if (!platform::is_gpu_place(op->GetPlace()) &&
!platform::is_xpu_place(op->GetPlace()))
return false;
for (auto &pending_op : graph_view.PendingOps(op)) {
auto *tmp = dynamic_cast<details::ComputationOpHandle *>(pending_op);
if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
if (!IsMatchedPlaceSingleDeviceOp<details::ComputationOpHandle>(
pending_op, op->GetPlace()) &&
!IsMatchedPlaceSingleDeviceOp<details::ScaleLossGradOpHandle>(
pending_op, op->GetPlace())) {
return false;
}
}
......
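The pass above previously treated only downstream ComputationOpHandles on the same place as lock/record-event free; the new IsMatchedPlaceSingleDeviceOp template generalizes the check so a ScaleLossGradOpHandle on the same place qualifies too. A self-contained sketch of the same dynamic_cast pattern, using stand-in types and an int place rather than Paddle's classes:

#include <iostream>

struct OpHandleBase { virtual ~OpHandleBase() = default; };
struct ComputationOpHandle : OpHandleBase {
  explicit ComputationOpHandle(int p) : place(p) {}
  int GetPlace() const { return place; }
  int place;
};
struct ScaleLossGradOpHandle : OpHandleBase {
  explicit ScaleLossGradOpHandle(int p) : place(p) {}
  int GetPlace() const { return place; }
  int place;
};

// Matches only if the op is of dynamic type T *and* runs on the given place.
template <typename T>
bool IsMatchedPlaceSingleDeviceOp(OpHandleBase* op_base, int place) {
  auto* op = dynamic_cast<T*>(op_base);
  return op && op->GetPlace() == place;
}

int main() {
  ScaleLossGradOpHandle op(/*place=*/0);
  std::cout << IsMatchedPlaceSingleDeviceOp<ComputationOpHandle>(&op, 0) << "\n";   // 0
  std::cout << IsMatchedPlaceSingleDeviceOp<ScaleLossGradOpHandle>(&op, 0) << "\n"; // 1
  return 0;
}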
......@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "glog/logging.h"
namespace paddle {
namespace framework {
......
......@@ -27,6 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
......@@ -34,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -43,6 +45,10 @@ limitations under the License. */
DECLARE_double(eager_delete_tensor_gb);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_bool(sync_nccl_allreduce);
#endif
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
......@@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// ncclOp
std::vector<ir::Graph *> async_graphs =
CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name);
PrepareForCUDAGraphCapture(graph);
graph = member_->ApplyMemoryOptimizePass(graph);
async_graphs[0] = graph;
......@@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices(
FetchResultType ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
VLOG(3) << "enter ParallelExecutor Run";
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
PADDLE_ENFORCE_EQ(
member_->build_strategy_.allow_cuda_graph_capture_, true,
platform::errors::InvalidArgument(
"You must turn on build_strategy.allow_cuda_graph_capture = True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
member_->places_[0], platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
#ifdef WITH_GPERFTOOLS
if (gProfileStarted) {
ProfilerFlush();
......@@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse(
void ParallelExecutor::FeedTensorsIntoLocalScopes(
const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
for (auto &tensor : tensors) {
PADDLE_ENFORCE_EQ(
tensor.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
}
return;
}
if (!member_->AllowPartialFeed()) {
PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(),
platform::errors::Unimplemented(
......@@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
const std::unordered_map<std::string, LoDTensor> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
tensors.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
return;
}
size_t num_places = member_->places_.size();
bool allow_partial_feed = member_->AllowPartialFeed();
......@@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const {
return member_->executor_->Graph();
}
void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) {
const auto &build_strategy = member_->build_strategy_;
if (!build_strategy.allow_cuda_graph_capture_) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
build_strategy.async_mode_, false,
platform::errors::InvalidArgument(
"Async Executor does not support CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(), false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to be "
"captured when running the first batch."));
PADDLE_ENFORCE_EQ(
member_->places_.size(), 1,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when one GPU device is running."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
std::unordered_map<std::string, std::vector<VarDesc *>> all_vars;
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
auto *var_desc = node->Var();
all_vars[var_desc->Name()].emplace_back(var_desc);
}
}
auto mark_var_as_persistable = [&all_vars](const std::string &name) {
auto iter = all_vars.find(name);
if (iter != all_vars.end()) {
for (auto *var_desc : iter->second) {
var_desc->SetPersistable(true);
}
}
};
// Step 1: All fused vars must be persistable.
if (graph->Has(details::kFusedVars)) {
auto &fused_vars = graph->Get<details::FusedVars>(details::kFusedVars);
for (auto &fused_var : fused_vars) {
fused_var.second.persistable_ = true;
mark_var_as_persistable(fused_var.first);
}
}
// Step 2: All pinned vars must be persistable.
if (graph->Has(details::kPinnedVars)) {
auto &pinned_vars = graph->Get<details::PinnedVars>(details::kPinnedVars);
for (auto &pinned_var : pinned_vars) {
mark_var_as_persistable(pinned_var);
}
}
// Step 3: Move all main programs to startup programs to make sure that
// the main programs would only be run once.
if (graph->Has(details::kProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
auto &main_programs =
graph->Get<details::ProgramDescs>(details::kProgramDescs);
for (auto &main_program : main_programs) {
startup_programs.emplace_back(main_program);
}
graph->Erase(details::kProgramDescs);
}
// Step 4: Mark all vars in startup programs to be persistable.
if (graph->Has(details::kStartupProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
for (auto &startup_program : startup_programs) {
for (auto &op_desc : startup_program.Block(0).AllOps()) {
for (auto &output : op_desc->OutputArgumentNames()) {
mark_var_as_persistable(output);
}
}
}
}
// Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy.
auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
auto *scope = member_->local_scopes_[0];
for (auto *op : ops) {
auto *loss_grad_op = dynamic_cast<details::ScaleLossGradOpHandle *>(op);
if (loss_grad_op == nullptr) continue;
auto loss_grad_name = loss_grad_op->LossGradName();
mark_var_as_persistable(loss_grad_name);
loss_grad_op->RunOnVar(scope->Var(loss_grad_name));
loss_grad_op->SetSkipRunning(true);
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
} // namespace framework
} // namespace paddle
......
......@@ -144,6 +144,8 @@ class ParallelExecutor {
void SetReaderOpDeviceInfoOfGraphs(
const std::vector<ir::Graph *> &final_graphs);
void PrepareForCUDAGraphCapture(ir::Graph *graph);
ParallelExecutorPrivate *member_;
std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
std::vector<VariableInfo> var_infos_;
......
......@@ -82,7 +82,11 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy )
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
if (WITH_GPU)
target_link_libraries(allocator_facade cuda_graph)
endif()
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
......
......@@ -32,6 +32,9 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
......@@ -47,17 +50,64 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
#ifdef PADDLE_WITH_CUDA
class CUDAGraphAllocator
: public Allocator,
public std::enable_shared_from_this<CUDAGraphAllocator> {
private:
class PrivateAllocation : public Allocation {
public:
PrivateAllocation(CUDAGraphAllocator* allocator,
AllocationPtr underlying_allocation)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->size(),
underlying_allocation->place()),
allocator_(allocator->shared_from_this()),
underlying_allocation_(std::move(underlying_allocation)) {}
private:
std::shared_ptr<Allocator> allocator_;
AllocationPtr underlying_allocation_;
};
explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
: underlying_allocator_(allocator) {}
public:
static std::shared_ptr<Allocator> Create(
const std::shared_ptr<Allocator>& allocator) {
return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
}
protected:
Allocation* AllocateImpl(size_t size) {
VLOG(10) << "Allocate " << size << " for CUDA Graph";
return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
}
void FreeImpl(Allocation* allocation) {
VLOG(10) << "delete for CUDA Graph";
delete allocation;
}
private:
std::shared_ptr<Allocator> underlying_allocator_;
};
#endif
class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy();
switch (strategy) {
explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
strategy_ = GetAllocatorStrategy();
switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
......@@ -91,7 +141,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk);
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
......@@ -117,7 +168,7 @@ class AllocatorFacadePrivate {
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported allocator strategy: %d", static_cast<int>(strategy)));
"Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
}
}
InitZeroSizeAllocators();
......@@ -130,11 +181,29 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
}
inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: allocators_)
: GetAllocatorMap())
: zero_size_allocators_);
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
......@@ -145,6 +214,7 @@ class AllocatorFacadePrivate {
private:
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
......@@ -183,10 +253,11 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize());
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
}
#endif
......@@ -226,6 +297,7 @@ class AllocatorFacadePrivate {
};
void InitZeroSizeAllocators() {
if (!zero_size_allocators_.empty()) return;
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -279,12 +351,57 @@ class AllocatorFacadePrivate {
}
}
#ifdef PADDLE_WITH_CUDA
public:
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d has been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
private:
AllocatorMap allocators_;
AllocatorMap zero_size_allocators_;
AllocatorMap system_allocators_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
AllocatorStrategy strategy_;
static AllocatorMap zero_size_allocators_;
static AllocatorMap system_allocators_;
};
AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;
// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// delete m_ may cause core dump when the destructor of python in conflict with
......@@ -316,6 +433,16 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
return m_->PrepareMemoryPoolForCUDAGraph(id);
}
void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
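The CUDAGraphAllocator above wraps every allocation made while a graph is being captured in a PrivateAllocation that holds a shared_ptr to the allocator, so the per-graph memory pool stays alive (and its device pointers stay valid for replay) until the last such allocation is released. A stripped-down sketch of that ownership pattern, with illustrative names only:

#include <cstdlib>
#include <iostream>
#include <memory>

class PoolAllocator;

// Each allocation pins the pool that produced it via a shared_ptr, so the
// pool is destroyed only after the last outstanding allocation is freed.
class PoolAllocation {
 public:
  PoolAllocation(void* ptr, std::shared_ptr<PoolAllocator> owner)
      : ptr_(ptr), owner_(std::move(owner)) {}
  ~PoolAllocation() { std::free(ptr_); }
  void* ptr() const { return ptr_; }

 private:
  void* ptr_;
  std::shared_ptr<PoolAllocator> owner_;
};

class PoolAllocator : public std::enable_shared_from_this<PoolAllocator> {
 public:
  ~PoolAllocator() { std::cout << "pool destroyed\n"; }
  std::unique_ptr<PoolAllocation> Allocate(size_t size) {
    return std::make_unique<PoolAllocation>(std::malloc(size), shared_from_this());
  }
};

int main() {
  auto allocation = std::make_shared<PoolAllocator>()->Allocate(256);
  std::cout << "pool still alive, allocation at " << allocation->ptr() << "\n";
  allocation.reset();  // "pool destroyed" prints only now
  return 0;
}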
......@@ -18,6 +18,9 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -54,6 +57,11 @@ class AllocatorFacade {
uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
#endif
// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
......
......@@ -39,11 +39,12 @@ namespace allocation {
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
size_t chunk_size)
size_t chunk_size, bool allow_free_idle_chunk)
: underlying_allocator_(
std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
alignment_(alignment),
chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {}
chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
allow_free_idle_chunk_(allow_free_idle_chunk) {}
Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, alignment_);
......@@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
}
uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
if (!allow_free_idle_chunk_) {
return 0;
}
uint64_t bytes = 0;
for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
auto &blocks = chunk_it->blocks_;
......
......@@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
public:
AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
size_t chunk_size = 0);
size_t chunk_size = 0, bool allow_free_idle_chunk = true);
bool IsAllocThreadSafe() const override { return true; }
......@@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
std::list<Chunk> chunks_;
size_t alignment_;
size_t chunk_size_;
bool allow_free_idle_chunk_;
SpinLock spinlock_;
};
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/cudnn_desc.h"
namespace paddle {
namespace operators {
......@@ -480,6 +481,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
......@@ -601,6 +603,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
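The CUDAGraphCaptureModeGuard added to the cuDNN algorithm-search paths above exists because exhaustive search may allocate memory and synchronize, which the default (global) stream-capture mode treats as a capture violation. A minimal RAII sketch of the idea using the public CUDA runtime call; this only illustrates the mechanism and is not Paddle's implementation:

#include <cuda_runtime.h>

// Temporarily place the calling thread in relaxed stream-capture mode, so
// calls such as cudaMalloc or synchronization issued during cuDNN algorithm
// search are not treated as capture violations. The previous mode is
// restored on destruction.
class ScopedRelaxedCaptureMode {
 public:
  ScopedRelaxedCaptureMode() : mode_(cudaStreamCaptureModeRelaxed) {
    // Exchanges the thread's capture mode; mode_ now holds the previous mode.
    cudaThreadExchangeStreamCaptureMode(&mode_);
  }
  ~ScopedRelaxedCaptureMode() {
    // Swap the previous mode back in.
    cudaThreadExchangeStreamCaptureMode(&mode_);
  }

 private:
  cudaStreamCaptureMode mode_;
};

void FindConvAlgo(/* ConvArgs, cudnnHandle_t, ... */) {
  ScopedRelaxedCaptureMode guard;  // analogous role to platform::CUDAGraphCaptureModeGuard
  // ... call cudnnFindConvolution*Algorithm / workspace-size queries here ...
}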
......@@ -18,7 +18,8 @@ register_operators(EXCLUDES
fused_bn_add_activation_op
fused_attention_op
fused_feedforward_op
fused_transformer_op)
fused_transformer_op
resnet_unit_op)
# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
......@@ -86,4 +87,11 @@ if (WITH_GPU OR WITH_ROCM)
op_library(fused_attention_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_attention);\n")
endif()
# resnet_unit needs cudnn 8.0 or above
if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000))
op_library(resnet_unit_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(resnet_unit);\n")
cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory)
cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory)
endif()
endif()
This diff has been collapsed.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
namespace dynload = platform::dynload;
template <typename T>
using BatchNormParamType =
typename platform::CudnnDataType<T>::BatchNormParamType;
#if CUDNN_VERSION >= 8000
template <typename T>
struct BNStatsFinalizeArgs {
BNStatsFinalizeArgs() {
dtype = platform::CudnnDataType<T>::type;
param_dtype = platform::CudnnDataType<BatchNormParamType<T>>::type;
format = CUDNN_TENSOR_NHWC;
}
void Set(const std::vector<int> &param_shape) {
PADDLE_ENFORCE_EQ(
param_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of param_shape is expected to be 4. But received "
"param_shape's size is %d, param_shape is [%s].",
param_shape.size(), framework::make_ddim(param_shape)));
in_desc.set(param_shape, format, param_dtype);
out_desc.set(param_shape, format, dtype);
}
cudnnDataType_t dtype;
cudnnDataType_t param_dtype;
cudnnTensorFormat_t format;
platform::TensorDescriptor in_desc;
platform::TensorDescriptor out_desc;
};
template <typename T>
class CudnnBNStatsFinalize {
public:
CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx,
const std::vector<int> &param_shape)
: train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING),
inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) {
args_.Set(param_shape);
}
~CudnnBNStatsFinalize() {}
void Forward(const platform::CUDADeviceContext &ctx, const Tensor &sum,
const Tensor &sum_of_squares, const Tensor &scale,
const Tensor &bias, Tensor *saved_mean, Tensor *saved_invstd,
Tensor *running_mean, Tensor *running_var, Tensor *equiv_scale,
Tensor *equiv_bias, double eps, float momentum,
int64_t ele_count, bool is_train) {
auto place = ctx.GetPlace();
if (is_train) {
TrainInit(ctx);
} else {
InferenceInit(ctx);
}
auto &op = is_train ? train_op_ : inference_op_;
// Set variant_param for both inference_op_ and train_op_
float *sum_ptr = const_cast<float *>(sum.data<float>());
float *sum_of_squares_ptr =
const_cast<float *>(sum_of_squares.data<float>());
float *scale_ptr = const_cast<float *>(scale.data<float>());
float *bias_ptr = const_cast<float *>(bias.data<float>());
float *saved_mean_ptr = saved_mean->mutable_data<float>(place);
float *saved_invstd_ptr = saved_invstd->mutable_data<float>(place);
float *running_mean_ptr = running_mean->mutable_data<float>(place);
float *running_var_ptr = running_var->mutable_data<float>(place);
T *equiv_scale_ptr = equiv_scale->mutable_data<T>(place);
T *equiv_bias_ptr = equiv_bias->mutable_data<T>(place);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_VAR, running_var_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, equiv_scale_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, equiv_bias_ptr);
op.SetOpVariantParamAttrPtr<double>(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps);
// Set extra variant_param only for train_op_:
if (is_train) {
op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr);
op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, saved_invstd_ptr);
double avg_factor = 1.0 - momentum;
op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT,
&ele_count);
op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR,
&avg_factor);
}
// fused op execute
auto handle = ctx.cudnn_handle();
op.Execute(handle);
}
private:
void TrainInit(const platform::CUDADeviceContext &ctx) {
// Set constant_param for train op
train_op_.SetOpConstParamAttr(
{CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER,
CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER,
CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER,
CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER,
CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER,
CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER,
CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
// Set input and output desc for train op
train_op_.SetOpConstParamDesc(
{CUDNN_PARAM_YSTATS_DESC, CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC},
args_.in_desc.desc());
train_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC,
args_.out_desc.desc());
// Get workspace
auto handle = ctx.cudnn_handle();
train_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
// Check workspace size, also creates plan.
size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle);
PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U,
platform::errors::InvalidArgument(
"Unexpected non-zero workspace size for "
"CudnnBNStatsFinalize."));
train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
static_cast<void *>(nullptr));
train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
&workspace_size_bytes);
}
void InferenceInit(const platform::CUDADeviceContext &ctx) {
// Set constant_param for inference op
inference_op_.SetOpConstParamAttr(
{CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER,
CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER,
CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER,
CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
// Set input and output desc for inference op
inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC,
args_.in_desc.desc());
inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC,
args_.out_desc.desc());
// Get workspace
auto handle = ctx.cudnn_handle();
inference_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
// Check workspace size, also creates plan.
size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle);
PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U,
platform::errors::InvalidArgument(
"Unexpected non-zero workspace size for "
"CudnnBNStatsFinalize."));
inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
static_cast<void *>(nullptr));
inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
&workspace_size_bytes);
}
BNStatsFinalizeArgs<T> args_;
CudnnFusionOp train_op_;
CudnnFusionOp inference_op_;
};
#endif
} // namespace operators
} // namespace paddle
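For reference, the outputs CudnnBNStatsFinalize requests above (saved mean/invstd, running stats, equiv_scale/equiv_bias) follow the standard batch-norm folding of scale and bias into a per-channel affine. A plain C++ sketch of that arithmetic, stated as the conventional formulation rather than a transcript of the cuDNN kernel:

#include <cmath>
#include <cstdio>

// Per-channel batch-norm statistics finalization (standard formulation):
//   mean        = sum / count
//   var         = sum_sq / count - mean^2
//   inv_std     = 1 / sqrt(var + eps)
//   equiv_scale = gamma * inv_std                 // folds BN into y = equiv_scale * x + equiv_bias
//   equiv_bias  = beta - gamma * mean * inv_std
// Running stats use the exponential-average factor avg_factor = 1 - momentum
// (cuDNN convention: running = (1 - avg_factor) * running + avg_factor * batch_stat).
struct BNFinalizeOut {
  float saved_mean, saved_invstd, equiv_scale, equiv_bias;
};

BNFinalizeOut FinalizeChannel(float sum, float sum_sq, float gamma, float beta,
                              double eps, int64_t ele_count) {
  float mean = sum / ele_count;
  float var = sum_sq / ele_count - mean * mean;
  float inv_std = 1.0f / std::sqrt(var + static_cast<float>(eps));
  return {mean, inv_std, gamma * inv_std, beta - gamma * mean * inv_std};
}

int main() {
  auto r = FinalizeChannel(/*sum=*/100.f, /*sum_sq=*/260.f, /*gamma=*/1.f,
                           /*beta=*/0.f, /*eps=*/1e-5, /*ele_count=*/50);
  std::printf("mean=%g invstd=%g eq_scale=%g eq_bias=%g\n",
              r.saved_mean, r.saved_invstd, r.equiv_scale, r.equiv_bias);
  return 0;
}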
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace dynload = platform::dynload;
#if CUDNN_VERSION >= 8000
// A wrapper for cuDNN fused_op API.
class CudnnFusionOp {
public:
explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) {
// New 'fused op' descriptor creation
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack(
&op_variant_params_, op_id));
}
~CudnnFusionOp() PADDLE_MAY_THROW {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_));
}
// Execute fused op
void Execute(cudnnHandle_t cudnn_handle) {
PADDLE_ENFORCE_EQ(
plan_created_, true,
platform::errors::Fatal(
"CudnnFusionOp exec requested without a valid 'plan', need: "
"<set const params>, GetWorkspaceSizeInBytes(), Execute()."));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_));
}
// Set const param pack attribute given a descriptor.
template <typename T>
void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label,
T *param_ptr) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetFusedOpsConstParamPackAttribute(
op_const_params_, param_label, param_ptr));
plan_created_ = false;
}
// Set multiple const param pack attributes given a descriptor.
template <typename T>
void SetOpConstParamDesc(
const std::vector<cudnnFusedOpsConstParamLabel_t> &param_labels,
T *param_ptr) {
for (auto param_label : param_labels) {
SetOpConstParamDesc(param_label, param_ptr);
}
}
// Set const param pack attribute given a value of param.
template <typename T>
void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label,
T param) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_,
param_label, &param));
plan_created_ = false;
}
// Set multiple const param pack attributes given a value of param.
template <typename T>
void SetOpConstParamAttr(
const std::vector<cudnnFusedOpsConstParamLabel_t> &param_labels,
T param) {
for (auto param_label : param_labels) {
SetOpConstParamAttr(param_label, param);
}
}
// Set a variant param pack attribute given a reference to a param.
template <typename T>
void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label,
T *param_ptr) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetFusedOpsVariantParamPackAttribute(
op_variant_params_, param_label, param_ptr));
}
// Set multiple variant param pack attributes given a reference to a param.
template <typename T>
void SetOpVariantParamAttrPtr(
const std::vector<cudnnFusedOpsVariantParamLabel_t> &param_labels,
const T *param_ptr) {
for (auto param_label : param_labels) {
SetOpVariantParamAttrPtr(param_label, param_ptr);
}
}
// Get the workspace size; calling this also creates the plan, which is required before Execute().
size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
if (!plan_created_) {
workspace_bytes_ = 0U;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
cudnn_handle, op_, op_const_params_, &workspace_bytes_));
plan_created_ = true;
}
return workspace_bytes_;
}
private:
bool plan_created_;
size_t workspace_bytes_;
cudnnFusedOpsPlan_t op_;
cudnnFusedOpsConstParamPack_t op_const_params_;
cudnnFusedOpsVariantParamPack_t op_variant_params_;
};
class CudnnFusionOpCache {
public:
static CudnnFusionOpCache &Instance() {
static CudnnFusionOpCache instance;
return instance;
}
framework::AlgorithmsCache<CudnnFusionOp *> *GetForward() {
return &forward_cache_;
}
framework::AlgorithmsCache<CudnnFusionOp *> *GetBackward() {
return &backward_cache_;
}
private:
CudnnFusionOpCache() {}
~CudnnFusionOpCache() {
// Need to delete the memory of cache.
}
CudnnFusionOpCache(const CudnnFusionOpCache &) {}
private:
framework::AlgorithmsCache<CudnnFusionOp *> forward_cache_;
framework::AlgorithmsCache<CudnnFusionOp *> backward_cache_;
};
#endif // CUDNN_VERSION >= 8000
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
namespace dynload = platform::dynload;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
#if CUDNN_VERSION >= 8000
static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
template <typename T>
struct NormConvolutionArgs {
NormConvolutionArgs() {
dtype = platform::CudnnDataType<T>::type;
format = CUDNN_TENSOR_NHWC;
compute_type = platform::CudnnDataType<float>::type;
}
void Set(const platform::CUDADeviceContext &ctx,
const std::vector<int> &input_shape,
const std::vector<int> &filter_shape,
const std::vector<int> &output_shape, int padding, int stride,
int dilation, int group) {
PADDLE_ENFORCE_EQ(
input_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of input_shape is expected to be 4. But received "
"input_shape's size is %d, input_shape is [%s].",
input_shape.size(), framework::make_ddim(input_shape)));
PADDLE_ENFORCE_EQ(
filter_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of filter_shape is expected to be 4. But received "
"filter_shape's size is %d, filter_shape is [%s].",
filter_shape.size(), framework::make_ddim(filter_shape)));
PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] &&
(filter_shape[1] == 1 || filter_shape[1] == 3),
true,
platform::errors::InvalidArgument(
"The filter_shape is expected to be stored as nhwc, and "
"h = w = 1 or 3. But received filter_shape is [%s].",
framework::make_ddim(filter_shape)));
PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0),
true,
platform::errors::InvalidArgument(
"The input channel is expected to be a multiple of 8, "
"and the output channel is expected to be a multiple "
"of 32. But received input channel is %d, output "
"channel is %d.",
filter_shape[3], filter_shape[0]));
PADDLE_ENFORCE_EQ(
output_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of output_shape is expected to be 4. But received "
"output_shape's size is %d, output_shape is [%s].",
output_shape.size(), framework::make_ddim(output_shape)));
is_support = IsSupport(ctx, filter_shape, stride, dilation, group);
PADDLE_ENFORCE_EQ(
is_support, true,
platform::errors::InvalidArgument(
"This operation is only supported on platforms with "
"compute capability greater than or equal to 70, and the kernel "
"size must be equal to 1 or 3. When the kernel size is 1, "
"the stride must be 1 if the compute capability is equal to 70. "
"Besides, the dilation and group must be equal to 1. But received "
"compute capability is %d, kernel size is %d, stride is %d, "
"dilation is %d, group is %d.",
ctx.GetComputeCapability(), filter_shape[1], stride, dilation,
group));
for (size_t i = 0; i < input_shape.size(); ++i) {
in_dims.push_back(input_shape[i]);
}
for (size_t i = 0; i < filter_shape.size(); ++i) {
filter_dims.push_back(filter_shape[i]);
}
paddings = {padding, padding};
strides = {stride, stride};
dilations = {dilation, dilation};
in_desc.set(input_shape, format, dtype);
filter_desc.set(filter_shape, format, dtype, group);
out_desc.set(output_shape, format, dtype);
int output_channel = filter_shape[0];
std::vector<int> stats_shape = {1, 1, 1, output_channel};
out_stats_desc.set(stats_shape, format, compute_type);
conv_desc.set(dtype, paddings, strides, dilations, false, group);
}
bool IsSupport(const platform::CUDADeviceContext &ctx,
const std::vector<int> &filter_shape, int stride, int dilation,
int group) {
int kernel_size = filter_shape[1];
if (dilation != 1 || group != 1) {
return false;
}
if (ctx.GetComputeCapability() == 70) {
if ((kernel_size == 3) || ((kernel_size == 1) && (stride == 1))) {
return true;
}
} else if (ctx.GetComputeCapability() > 70) {
if ((kernel_size == 3) || (kernel_size == 1)) {
return true;
}
}
return false;
}
cudnnDataType_t dtype;
cudnnTensorFormat_t format;
cudnnDataType_t compute_type;
std::vector<int64_t> in_dims;
std::vector<int64_t> filter_dims;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> dilations;
platform::TensorDescriptor in_desc;
platform::FilterDescriptor filter_desc;
platform::TensorDescriptor out_desc;
platform::TensorDescriptor out_stats_desc;
platform::ConvolutionDescriptor conv_desc;
bool is_support;
};
template <typename T>
class CudnnNormConvolution {
public:
CudnnNormConvolution(const platform::CUDADeviceContext &ctx,
const std::vector<int> &input_shape,
const std::vector<int> &filter_shape,
const std::vector<int> &output_shape, const int &padding,
const int &stride, const int &dilation,
const int &group) {
args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride,
dilation, group);
}
~CudnnNormConvolution() {}
void Forward(const platform::CUDADeviceContext &ctx, const Tensor &input,
const Tensor &filter, Tensor *output, Tensor *sum,
Tensor *sum_of_squares) {
auto cudnn_handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
CudnnFusionOp *fwd_op = GetForwardOp(ctx);
size_t workspace_size = RoundUp(
static_cast<int64_t>(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)),
512);
// Set variant_param
// input ptr
T *input_ptr = const_cast<T *>(input.data<T>());
T *filter_ptr = const_cast<T *>(filter.data<T>());
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
fwd_op->SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
// output ptr
T *output_ptr = output->mutable_data<T>(place);
float *sum_ptr = sum->mutable_data<float>(place);
float *sum_of_squares_ptr = sum_of_squares->mutable_data<float>(place);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
ctx.cudnn_workspace_handle().RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
// fused op execute
fwd_op->Execute(cudnn_handle);
},
workspace_size);
}
private:
CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) {
framework::AlgorithmsCache<CudnnFusionOp *> &cache =
*(CudnnFusionOpCache::Instance().GetForward());
CudnnFusionOp *fwd_op = cache.GetAlgorithm(
args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
CudnnFusionOp *fwd_op =
new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS);
// Set constant_param
fwd_op->SetOpConstParamAttr(
{CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
CUDNN_PARAM_YDATA_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
fwd_op->SetOpConstParamAttr(
{CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
// conv desc
fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
args_.conv_desc.desc());
// input desc
fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
// filter desc
fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC,
args_.filter_desc.desc());
// output desc
fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc());
// output_stats desc
fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
args_.out_stats_desc.desc());
// batch_norm mode
fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
// Make cudnn fused ops plan
fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
return fwd_op;
});
return fwd_op;
}
private:
NormConvolutionArgs<T> args_;
};
template <typename T>
class CudnnNormConvolutionGrad {
public:
CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx,
const std::vector<int> &input_shape,
const std::vector<int> &filter_shape,
const std::vector<int> &output_shape,
const int &padding, const int &stride,
const int &dilation, const int &group) {
args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride,
dilation, group);
dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
~CudnnNormConvolutionGrad() {}
void Backward(const platform::CUDADeviceContext &ctx, const Tensor &input,
const Tensor &filter, const Tensor &output_grad,
Tensor *input_grad, Tensor *filter_grad,
bool use_addto = false) {
auto place = ctx.GetPlace();
T *input_ptr = const_cast<T *>(input.data<T>());
T *filter_ptr = const_cast<T *>(filter.data<T>());
T *output_grad_ptr = const_cast<T *>(output_grad.data<T>());
if (filter_grad) {
T *filter_grad_ptr = filter_grad->mutable_data<T>(place);
BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr);
}
if (input_grad) {
T *input_grad_ptr = input_grad->mutable_data<T>(place);
BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto);
}
}
private:
void BackwardFilter(const platform::CUDADeviceContext &ctx,
T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) {
auto cudnn_handle = ctx.cudnn_handle();
CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx);
size_t workspace_size = RoundUp(
static_cast<int64_t>(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)),
512);
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr);
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr);
wgrad_op->SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
ctx.cudnn_workspace_handle().RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
workspace_ptr);
// fused op execute
wgrad_op->Execute(cudnn_handle);
},
workspace_size);
}
void BackwardData(const platform::CUDADeviceContext &ctx, T *output_grad_ptr,
T *filter_ptr, T *input_grad_ptr, bool use_addto = false) {
auto cudnn_handle = ctx.cudnn_handle();
size_t workspace_size = GetWorkspaceSizeBwdData(ctx);
// Convolution dgrad followed optionally by batchnorm dgrad
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
ctx.cudnn_workspace_handle().RunFunc(
[&](void *cudnn_workspace_ptr) {
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr,
args_.out_desc.desc(), output_grad_ptr,
args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr,
workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr));
},
workspace_size);
}
CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) {
framework::AlgorithmsCache<CudnnFusionOp *> &cache =
*(CudnnFusionOpCache::Instance().GetBackward());
CudnnFusionOp *wgrad_op = cache.GetAlgorithm(
args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
CudnnFusionOp *wgrad_op =
new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD);
wgrad_op->SetOpConstParamAttr(
{CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER,
CUDNN_PARAM_DWDATA_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
// conv desc
wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
args_.conv_desc.desc());
// input desc
wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC,
args_.in_desc.desc());
// filter desc
wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC,
args_.filter_desc.desc());
// output desc
wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC,
args_.out_desc.desc());
wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
// Make cudnn fused ops plan
wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
return wgrad_op;
});
return wgrad_op;
}
size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) {
size_t workspace_size = 0U;
auto handle = ctx.cudnn_handle();
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, args_.filter_desc.desc(), args_.out_desc.desc(),
args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_,
&workspace_size));
return RoundUp(workspace_size, 512);
}
private:
NormConvolutionArgs<T> args_;
cudnnConvolutionBwdDataAlgo_t dgrad_algo_;
};
#endif
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/float16.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(conv2d);
USE_OP(conv2d_grad);
USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
framework::Tensor *cpu_out) {
T *cpu_out_ptr = cpu_out->mutable_data<T>(framework::make_ddim(dims),
platform::CPUPlace());
std::default_random_engine random(0);
std::uniform_real_distribution<float> dis(0.0, 1.0);
for (int i = 0; i < cpu_out->numel(); ++i) {
cpu_out_ptr[i] = static_cast<T>(dis(random));
}
}
template <typename T>
void TransposeNchwToNhwc(const framework::Tensor &cpu_in,
framework::Tensor *cpu_out) {
auto in_dims = cpu_in.dims();
EXPECT_EQ(cpu_in.dims().size(), 4);
const T *cpu_in_ptr = cpu_in.data<T>();
T *cpu_out_ptr = cpu_out->mutable_data<T>(
{in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace());
int64_t n = in_dims[0];
int64_t c = in_dims[1];
int64_t hw = in_dims[2] * in_dims[3];
for (int i = 0; i < n; ++i) {
for (int j = 0; j < hw; ++j) {
for (int k = 0; k < c; ++k) {
int dst_idx = i * hw * c + j * c + k;
int src_idx = i * c * hw + k * hw + j;
cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx];
}
}
}
}
template <typename T>
void CheckOutput(const framework::Tensor &cpu_res,
const framework::Tensor &cpu_base, float diff,
bool is_relative_atol = false) {
EXPECT_EQ(cpu_res.dims(), cpu_base.dims());
const T *cpu_res_ptr = cpu_res.data<T>();
const T *cpu_base_ptr = cpu_base.data<T>();
for (int i = 0; i < cpu_res.numel(); ++i) {
if (is_relative_atol) {
EXPECT_LT(static_cast<float>(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) /
cpu_base_ptr[i])),
diff);
} else {
EXPECT_LT(static_cast<float>(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])),
diff);
}
}
}
// Use Paddle conv2d op results as baseline
void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_input, const Tensor &cpu_filter,
Tensor *cpu_output, int stride, int padding) {
framework::Scope scope;
auto *input = scope.Var("Input")->GetMutable<framework::LoDTensor>();
auto *filter = scope.Var("Filter")->GetMutable<framework::LoDTensor>();
auto *output = scope.Var("Output")->GetMutable<framework::LoDTensor>();
auto place = ctx.GetPlace();
TensorCopySync(cpu_input, place, input);
TensorCopySync(cpu_filter, place, filter);
framework::AttributeMap attrs;
bool use_cudnn = true;
std::string data_format = "NHWC";
std::vector<int> strides = {stride, stride};
std::vector<int> paddings = {padding, padding};
attrs.insert({"strides", strides});
attrs.insert({"paddings", paddings});
attrs.insert({"use_cudnn", use_cudnn});
attrs.insert({"data_format", data_format});
auto op = framework::OpRegistry::CreateOp(
"conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}},
{{"Output", {"Output"}}}, attrs);
op->Run(scope, ctx.GetPlace());
TensorCopySync(*output, platform::CPUPlace(), cpu_output);
}
// Use Paddle conv2d_grad op results as baseline
void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
const Tensor &cpu_input, const Tensor &cpu_filter,
const Tensor &cpu_output_grad,
framework::Tensor *cpu_input_grad,
framework::Tensor *cpu_filter_grad, int stride,
int padding, int dilation) {
framework::Scope scope;
auto *input = scope.Var("Input")->GetMutable<framework::LoDTensor>();
auto *filter = scope.Var("Filter")->GetMutable<framework::LoDTensor>();
auto *output_grad =
scope.Var("Output@GRAD")->GetMutable<framework::LoDTensor>();
auto *input_grad =
scope.Var("Input@GRAD")->GetMutable<framework::LoDTensor>();
auto *filter_grad =
scope.Var("Filter@GRAD")->GetMutable<framework::LoDTensor>();
auto place = ctx.GetPlace();
TensorCopySync(cpu_input, place, input);
TensorCopySync(cpu_filter, place, filter);
TensorCopySync(cpu_output_grad, place, output_grad);
framework::AttributeMap attrs;
bool use_cudnn = true;
std::string data_format = "NHWC";
std::string padding_algorithm = "EXPLICIT";
std::vector<int> strides = {stride, stride};
std::vector<int> paddings = {padding, padding};
std::vector<int> dilations = {dilation, dilation};
int groups = 1;
bool exhaustive_search = false;
bool use_addto = false;
attrs.insert({"use_cudnn", use_cudnn});
attrs.insert({"data_format", data_format});
attrs.insert({"padding_algorithm", padding_algorithm});
attrs.insert({"strides", strides});
attrs.insert({"paddings", paddings});
attrs.insert({"dilations", dilations});
attrs.insert({"groups", groups});
attrs.insert({"exhaustive_search", exhaustive_search});
attrs.insert({"use_addto", use_addto});
auto op = framework::OpRegistry::CreateOp(
"conv2d_grad", {{"Input", {"Input"}},
{"Filter", {"Filter"}},
{"Output@GRAD", {"Output@GRAD"}}},
{{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}},
attrs);
op->Run(scope, ctx.GetPlace());
TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad);
TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad);
}
template <typename T>
void ComputeSumAndSquareSum(const framework::Tensor &cpu_out,
framework::Tensor *cpu_sum,
framework::Tensor *cpu_sum_of_square) {
auto dims = cpu_out.dims();
int64_t c = dims[3];
const T *cpu_out_ptr = cpu_out.data<T>();
float *cpu_sum_ptr =
cpu_sum->mutable_data<float>({1, 1, 1, c}, platform::CPUPlace());
float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data<float>(
{1, 1, 1, c}, platform::CPUPlace());
for (int j = 0; j < c; ++j) {
float tmp_sum = 0.0f;
float tmp_sum_of_squares = 0.0f;
for (int i = 0; i < cpu_out.numel() / c; ++i) {
float tmp_out = static_cast<float>(cpu_out_ptr[i * c + j]);
tmp_sum += tmp_out;
tmp_sum_of_squares += tmp_out * tmp_out;
}
cpu_sum_ptr[j] = tmp_sum;
cpu_sum_square_ptr[j] = tmp_sum_of_squares;
}
}
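// Compares the fused CudnnNormConvolution forward/backward against Paddle's
// cudnn conv2d/conv2d_grad kernels. The baseline keeps the filter in
// NCHW(OIHW) layout (its filter gradient is transposed to NHWC before the
// comparison), while the fused path uses an NHWC filter and also produces the
// per-channel sum and sum-of-squares consumed by the BN stats finalize step.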
template <typename T>
class CudnnNormConvolutionTester {
public:
CudnnNormConvolutionTester(int batch_size, int height, int width,
int input_channels, int output_channels,
int kernel_size, int stride) {
batch_size_ = batch_size;
height_ = height;
width_ = width;
input_channels_ = input_channels;
output_channels_ = output_channels;
kernel_size_ = kernel_size;
stride_ = stride;
padding_ = (kernel_size_ - 1) / 2;
out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1;
out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1;
SetUp();
}
~CudnnNormConvolutionTester() {}
void CheckForward(float diff, bool is_relative_atol = false) {
platform::CUDADeviceContext *ctx =
static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(0)));
framework::Tensor cpu_output_base;
framework::Tensor cpu_sum_base;
framework::Tensor cpu_sum_of_square_base;
BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base,
&cpu_sum_of_square_base);
framework::Tensor cpu_output;
framework::Tensor cpu_sum;
framework::Tensor cpu_sum_of_square;
FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square);
// Check forward correctness between baseline and results of normconv.
CheckOutput<T>(cpu_output, cpu_output_base, diff, is_relative_atol);
CheckOutput<float>(cpu_sum, cpu_sum_base, diff, is_relative_atol);
CheckOutput<float>(cpu_sum_of_square, cpu_sum_of_square_base, diff,
is_relative_atol);
}
void CheckBackward(float diff, bool is_relative_atol = false) {
platform::CUDADeviceContext *ctx =
static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(0)));
framework::Tensor cpu_input_grad_base;
framework::Tensor cpu_filter_nchw_grad_base;
framework::Tensor cpu_filter_nhwc_grad_base;
BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base);
TransposeNchwToNhwc<T>(cpu_filter_nchw_grad_base,
&cpu_filter_nhwc_grad_base);
framework::Tensor cpu_input_grad;
framework::Tensor cpu_filter_nhwc_grad;
FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad);
// Check backward correctness between baseline and results of normconv.
CheckOutput<T>(cpu_input_grad, cpu_input_grad_base, diff, is_relative_atol);
CheckOutput<T>(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff,
is_relative_atol);
}
private:
void SetUp() {
InitRandomTensor<T>({batch_size_, height_, width_, input_channels_},
&cpu_input_);
InitRandomTensor<T>(
{output_channels_, input_channels_, kernel_size_, kernel_size_},
&cpu_filter_nchw_);
// transpose the filter from NCHW to NHWC
TransposeNchwToNhwc<T>(cpu_filter_nchw_, &cpu_filter_nhwc_);
InitRandomTensor<T>(
{batch_size_, out_height_, out_width_, output_channels_},
&cpu_output_grad_);
}
void BaselineForward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_output_base,
framework::Tensor *cpu_sum_base,
framework::Tensor *cpu_sum_of_square_base) {
ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base,
stride_, padding_);
ComputeSumAndSquareSum<T>(*cpu_output_base, cpu_sum_base,
cpu_sum_of_square_base);
}
void BaselineBackward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_input_grad_base,
framework::Tensor *cpu_filter_grad_base) {
ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_,
cpu_input_grad_base, cpu_filter_grad_base, stride_,
padding_, dilation_);
}
// get forward results of cudnn_norm_conv
void FusedForward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_output, framework::Tensor *cpu_sum,
framework::Tensor *cpu_sum_of_square) {
framework::Tensor input;
framework::Tensor filter_nhwc;
framework::Tensor output;
framework::Tensor sum;
framework::Tensor sum_of_square;
auto place = ctx.GetPlace();
TensorCopySync(cpu_input_, place, &input);
TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc);
output.Resize(framework::make_ddim(
{batch_size_, out_height_, out_width_, output_channels_}));
sum.Resize(framework::make_ddim({1, 1, 1, output_channels_}));
sum_of_square.Resize(framework::make_ddim({1, 1, 1, output_channels_}));
auto input_shape = framework::vectorize<int>(input.dims());
auto filter_shape = framework::vectorize<int>(filter_nhwc.dims());
auto output_shape = framework::vectorize<int>(output.dims());
op::CudnnNormConvolution<T> conv_op(ctx, input_shape, filter_shape,
output_shape, padding_, stride_,
dilation_, group_);
conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square);
TensorCopySync(output, platform::CPUPlace(), cpu_output);
TensorCopySync(sum, platform::CPUPlace(), cpu_sum);
TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square);
}
void FusedBackward(const platform::CUDADeviceContext &ctx,
framework::Tensor *cpu_input_grad,
framework::Tensor *cpu_filter_grad) {
framework::Tensor input;
framework::Tensor filter_nhwc;
framework::Tensor output_grad;
framework::Tensor input_grad;
framework::Tensor filter_grad;
auto place = ctx.GetPlace();
TensorCopySync(cpu_input_, place, &input);
TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc);
TensorCopySync(cpu_output_grad_, place, &output_grad);
input_grad.Resize(input.dims());
filter_grad.Resize(filter_nhwc.dims());
auto input_shape = framework::vectorize<int>(input.dims());
auto filter_shape = framework::vectorize<int>(filter_nhwc.dims());
auto output_shape = framework::vectorize<int>(output_grad.dims());
op::CudnnNormConvolutionGrad<T> conv_grad_op(ctx, input_shape, filter_shape,
output_shape, padding_,
stride_, dilation_, group_);
conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad,
&filter_grad);
TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad);
TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad);
}
private:
int batch_size_;
int height_;
int width_;
int out_height_;
int out_width_;
int input_channels_;
int output_channels_;
int kernel_size_;
int stride_;
int padding_;
const int dilation_ = 1;
const int group_ = 1;
// Forward input
framework::Tensor cpu_input_;
framework::Tensor cpu_filter_nchw_;
framework::Tensor cpu_filter_nhwc_;
// Backward input
framework::Tensor cpu_output_grad_;
};
// test for fp16, kernel = 1, output_channels = input_channels
TEST(CudnnNormConvFp16, K1S1) {
int batch_size = 4;
int height = 56;
int width = 56;
int input_channels = 32;
int output_channels = 32;
int kernel_size = 1;
int stride = 1;
CudnnNormConvolutionTester<paddle::platform::float16> test(
batch_size, height, width, input_channels, output_channels, kernel_size,
stride);
test.CheckForward(1e-3, true);
test.CheckBackward(1e-3, true);
}
// test for fp16, kernel = 3, output_channels = input_channels
TEST(CudnnNormConvFp16, K3S1) {
int batch_size = 4;
int height = 56;
int width = 56;
int input_channels = 32;
int output_channels = 32;
int kernel_size = 3;
int stride = 1;
CudnnNormConvolutionTester<paddle::platform::float16> test(
batch_size, height, width, input_channels, output_channels, kernel_size,
stride);
test.CheckForward(1e-3, true);
test.CheckBackward(1e-3, true);
}
// test for fp16, kernel = 1, output_channels = input_channels * 4
TEST(CudnnNormConvFp16, K1S1O4) {
int batch_size = 4;
int height = 56;
int width = 56;
int input_channels = 32;
int output_channels = 128;
int kernel_size = 1;
int stride = 1;
CudnnNormConvolutionTester<paddle::platform::float16> test(
batch_size, height, width, input_channels, output_channels, kernel_size,
stride);
test.CheckForward(1e-3, true);
test.CheckBackward(1e-3, true);
}
// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4
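// This configuration is only expected to work on GPUs with compute capability
// above 70; on older devices the fused op should raise an error (checked below).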
TEST(CudnnNormConvFp16, K1S2O4) {
int batch_size = 4;
int height = 8;
int width = 8;
int input_channels = 32;
int output_channels = 128;
int kernel_size = 1;
int stride = 2;
CudnnNormConvolutionTester<paddle::platform::float16> test(
batch_size, height, width, input_channels, output_channels, kernel_size,
stride);
platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
if (ctx->GetComputeCapability() <= 70) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
} else {
ASSERT_NO_THROW(test.CheckForward(1e-3, true));
ASSERT_NO_THROW(test.CheckBackward(1e-3));
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
namespace dynload = platform::dynload;
template <typename T>
using BatchNormParamType =
typename platform::CudnnDataType<T>::BatchNormParamType;
#if CUDNN_VERSION >= 8000
template <typename T>
struct ScaleBiasAddReluArgs {
ScaleBiasAddReluArgs() {
dtype = platform::CudnnDataType<T>::type;
param_dtype = platform::CudnnDataType<BatchNormParamType<T>>::type;
format = CUDNN_TENSOR_NHWC;
}
void Set(const std::string &act_type, const std::vector<int> &data_shape,
const std::vector<int> &param_shape,
const std::vector<int> &bitmask_shape) {
PADDLE_ENFORCE_EQ(
data_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of data_shape is expected to 4. But recieved "
"data_shape's size is %d, data_shape is [%s].",
data_shape.size(), framework::make_ddim(data_shape)));
PADDLE_ENFORCE_EQ(
param_shape.size(), 4U,
platform::errors::InvalidArgument(
"The size of param_shape is expected to 4. But recieved "
"param_shape's size is %d, param_shape is [%s].",
param_shape.size(), framework::make_ddim(param_shape)));
PADDLE_ENFORCE_EQ(
bitmask_shape.size(), 3U,
platform::errors::InvalidArgument(
"The size of bitmask_shape is expected to 3. But recieved "
"bitmask_shape's size is %d, bitmask_shape is [%s].",
bitmask_shape.size(), framework::make_ddim(bitmask_shape)));
in_desc.set(data_shape, format, dtype);
out_desc.set(data_shape, format, dtype);
equiv_scale_bias_desc.set(param_shape, format, dtype);
scale_bias_mean_var_desc.set(param_shape, format, param_dtype);
bitmask_desc.set(bitmask_shape, format, CUDNN_DATA_INT32);
// set activation desc
cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY;
if (act_type != "") {
PADDLE_ENFORCE_EQ(
act_type, "relu",
platform::errors::InvalidArgument(
"Only relu activation supported in normalized convolution."));
mode = CUDNN_ACTIVATION_RELU;
}
double dummy_clip = 0.0;
activation_desc.set(mode, dummy_clip);
}
cudnnDataType_t dtype;
cudnnDataType_t param_dtype;
cudnnTensorFormat_t format;
platform::TensorDescriptor in_desc;
platform::TensorDescriptor out_desc;
platform::TensorDescriptor equiv_scale_bias_desc;
platform::TensorDescriptor scale_bias_mean_var_desc;
platform::TensorDescriptor bitmask_desc;
platform::ActivationDescriptor activation_desc;
};
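// Wraps the cuDNN v8 fused ops used by resnet_unit:
//  - forward: CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK, which applies
//    a per-channel equivalent scale/bias (as produced by BNStatsFinalize) to x,
//    optionally adds the shortcut branch, applies relu and records the
//    activation pattern in a bitmask;
//  - backward: CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM, which replays the
//    bitmask to back-propagate through relu and the batch norm affine part.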
template <typename T>
class CudnnScaleBiasAddRelu {
public:
CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx,
const std::string &act_type, bool fuse_add,
bool has_shortcut, const std::vector<int> &data_shape,
const std::vector<int> &param_shape,
const std::vector<int> &bitmask_shape)
: fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK),
bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) {
fuse_add_ = fuse_add;
has_shortcut_ = has_shortcut;
args_.Set(act_type, data_shape, param_shape, bitmask_shape);
}
~CudnnScaleBiasAddRelu() {}
void Forward(const platform::CUDADeviceContext &ctx, const Tensor &x,
const Tensor &x_scale, const Tensor &x_bias, const Tensor *z,
const Tensor *z_scale, const Tensor *z_bias, Tensor *out,
Tensor *bitmask) {
ForwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
T *x_ptr = const_cast<T *>(x.data<T>());
T *x_scale_ptr = const_cast<T *>(x_scale.data<T>());
T *x_bias_ptr = const_cast<T *>(x_bias.data<T>());
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr);
if (has_shortcut_) {
T *z_ptr = const_cast<T *>(z->data<T>());
T *z_scale_ptr = const_cast<T *>(z_scale->data<T>());
T *z_bias_ptr = const_cast<T *>(z_bias->data<T>());
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr);
} else {
if (fuse_add_) {
T *z_ptr = const_cast<T *>(z->data<T>());
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr);
}
}
fwd_op_.SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
// output ptr
T *out_ptr = out->mutable_data<T>(place);
int32_t *bitmask_ptr = bitmask->mutable_data<int32_t>(place);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr);
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr);
workspace_handle.RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
// run the fused forward op
fwd_op_.Execute(handle);
},
fwd_workspace_byte_);
}
void Backward(const platform::CUDADeviceContext &ctx, const Tensor &dy,
const Tensor &x, const Tensor &scale, const Tensor &bias,
const Tensor &saved_mean, const Tensor &saved_invstd,
const Tensor *bitmask, Tensor *dx, Tensor *dz, Tensor *dscale,
Tensor *dbias, double eps) {
BackwardInit(ctx);
auto handle = ctx.cudnn_handle();
auto place = ctx.GetPlace();
auto workspace_handle = ctx.cudnn_workspace_handle();
bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle);
// Set variant_param
// input ptr
T *dy_ptr = const_cast<T *>(dy.data<T>());
T *x_ptr = const_cast<T *>(x.data<T>());
float *scale_ptr = const_cast<float *>(scale.data<float>());
float *bias_ptr = const_cast<float *>(bias.data<float>());
float *saved_mean_ptr = const_cast<float *>(saved_mean.data<float>());
float *saved_invstd_ptr = const_cast<float *>(saved_invstd.data<float>());
int32_t *bitmask_ptr =
bitmask ? const_cast<int32_t *>(bitmask->data<int32_t>()) : nullptr;
T *dx_ptr = dx->mutable_data<T>(place);
T *dz_ptr = dz ? dz->mutable_data<T>(place) : nullptr;
float *dscale_ptr = dscale ? dscale->mutable_data<float>(place) : nullptr;
float *dbias_ptr = dbias ? dbias->mutable_data<float>(place) : nullptr;
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD,
saved_invstd_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr);
bwd_op_.SetOpVariantParamAttrPtr(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &bwd_workspace_byte_);
// output ptr
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DXDATA, dx_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DSCALE, dscale_ptr);
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr);
bwd_op_.SetOpVariantParamAttrPtr<double>(CUDNN_SCALAR_DOUBLE_BN_EPSILON,
&eps);
if (has_shortcut_ || fuse_add_) {
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr);
}
workspace_handle.RunFunc(
[&](void *workspace_ptr) {
// workspace ptr
bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
// run the fused backward op
bwd_op_.Execute(handle);
},
bwd_workspace_byte_);
}
private:
void ForwardInit(const platform::CUDADeviceContext &ctx) {
// Set constant_param
fwd_op_.SetOpConstParamAttr(
{CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER,
CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, CUDNN_PARAM_YDATA_PLACEHOLDER,
CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
if (has_shortcut_) {
fwd_op_.SetOpConstParamAttr(
{CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER,
CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
} else if (fuse_add_) {
fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER,
CUDNN_PTR_16B_ALIGNED);
}
// input desc
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
if (has_shortcut_ || fuse_add_) {
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc());
}
// equiv scale/bias desc
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC,
args_.equiv_scale_bias_desc.desc());
if (has_shortcut_) {
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC,
args_.equiv_scale_bias_desc.desc());
}
// output desc
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc());
// bitmask desc
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC,
args_.bitmask_desc.desc());
// activation desc
fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC,
args_.activation_desc.desc());
// others
fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
}
void BackwardInit(const platform::CUDADeviceContext &ctx) {
// Set constant_param
bwd_op_.SetOpConstParamAttr(
{CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER,
CUDNN_PARAM_DXDATA_PLACEHOLDER, CUDNN_PARAM_BN_SCALE_PLACEHOLDER,
CUDNN_PARAM_BN_BIAS_PLACEHOLDER, CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER,
CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER,
CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER,
CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER},
CUDNN_PTR_16B_ALIGNED);
if (has_shortcut_ || fuse_add_) {
bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER,
CUDNN_PTR_16B_ALIGNED);
}
// input desc
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc());
if (has_shortcut_ || fuse_add_) {
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc());
}
// scale/bias/mean/var desc for backward
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC,
args_.scale_bias_mean_var_desc.desc());
// output desc
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DYDESC, args_.out_desc.desc());
// bitmask desc
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC,
args_.bitmask_desc.desc());
// activation desc
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC,
args_.activation_desc.desc());
// others
bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
}
bool fuse_add_ = false;
bool has_shortcut_ = false;
size_t fwd_workspace_byte_;
size_t bwd_workspace_byte_;
ScaleBiasAddReluArgs<T> args_;
CudnnFusionOp fwd_op_;
CudnnFusionOp bwd_op_;
};
#endif
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Shape of bitmask
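// The bitmask holds one bit per output element (the relu activation pattern
// consumed by the backward fused op), packed into int32 words: the channel
// extent is padded up to a multiple of 64 bits (i.e. (c + 63) / 64 * 2 int32
// elements) and the N*H*W extent up to a multiple of 32. For example,
// out_shape = {4, 56, 56, 32} gives bitmask dims {12544, 2, 1}.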
static framework::DDim GetBitmaskDims(std::vector<int> out_shape) {
int c = out_shape.back();
int64_t nhw = std::accumulate(out_shape.begin(), out_shape.end(), 1,
std::multiplies<int>()) /
c;
int32_t c_int32_elems = ((c + 63) & ~63) / 32;
int32_t nhw_int32_elems = ((nhw + 31) & ~31);
std::vector<int> bitmask_shape = {nhw_int32_elems, c_int32_elems, 1};
return framework::make_ddim(bitmask_shape);
}
class ResNetUnitOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const {
// Check input
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("MeanX"), "Input", "MeanX", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("VarX"), "Input", "VarX", "ResNetUnitOp");
bool fuse_add = ctx->Attrs().Get<bool>("fuse_add");
bool has_shortcut = ctx->Attrs().Get<bool>("has_shortcut");
if (fuse_add || has_shortcut) {
OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitOp");
}
if (has_shortcut) {
OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("MeanZ"), "Input", "MeanZ", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasInput("VarZ"), "Input", "VarZ", "ResNetUnitOp");
}
// Check output
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("BitMask"), "Output", "BitMask",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("ConvX"), "Output", "ConvX", "ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedMeanX"), "Output", "SavedMeanX",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdX"), "Output", "SavedInvstdX",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("RunningMeanX"), "Output", "RunningMeanX",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("RunningVarX"), "Output", "RunningVarX",
"ResNetUnitOp");
if (has_shortcut) {
OP_INOUT_CHECK(ctx->HasOutput("ConvZ"), "Output", "ConvZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedMeanZ"), "Output", "SavedMeanZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdZ"), "Output", "SavedInvstdZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("RunningMeanZ"), "Output", "RunningMeanZ",
"ResNetUnitOp");
OP_INOUT_CHECK(ctx->HasOutput("RunningVarZ"), "Output", "RunningVarZ",
"ResNetUnitOp");
}
// make sure Mean/RunningMean and Var/RunningVar share memory
PADDLE_ENFORCE_EQ(
ctx->Inputs("MeanX")[0], ctx->Outputs("RunningMeanX")[0],
platform::errors::InvalidArgument(
"MeanX and RunningMeanX should share the same memory"));
PADDLE_ENFORCE_EQ(ctx->Inputs("VarX")[0], ctx->Outputs("RunningVarX")[0],
platform::errors::InvalidArgument(
"VarX and RunningVarX should share the same memory"));
if (has_shortcut) {
PADDLE_ENFORCE_EQ(
ctx->Inputs("MeanZ")[0], ctx->Outputs("RunningMeanZ")[0],
platform::errors::InvalidArgument(
"MeanZ and RunningMeanZ should share the same memory"));
PADDLE_ENFORCE_EQ(
ctx->Inputs("VarZ")[0], ctx->Outputs("RunningVarZ")[0],
platform::errors::InvalidArgument(
"VarZ and RunningVarZ should share the same memory"));
}
// Check dims of inputs
const auto x_dims = ctx->GetInputDim("X");
const auto w_dims = ctx->GetInputDim("FilterX");
const auto bn_param_dims = ctx->GetInputDim("ScaleX");
PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument(
"The dimensions of input "
"must equal to 4."
"But received: the shape of input "
"= [%s], the dimension of input = "
"[%d]",
x_dims, x_dims.size()));
PADDLE_ENFORCE_EQ(w_dims.size(), 4,
platform::errors::InvalidArgument(
"The dimensions of filter "
"must equal to 4."
"But received: the shape of filter "
"= [%s], the dimension of filter = [%d] ",
w_dims, w_dims.size()));
PADDLE_ENFORCE_EQ(bn_param_dims.size(), 4,
platform::errors::InvalidArgument(
"The dimensions of bn param "
"must equal to 4."
"But received: the shape of bn param "
"= [%s], the dimension of bn param = [%d] ",
bn_param_dims, bn_param_dims.size()));
auto data_format = ctx->Attrs().Get<std::string>("data_format");
PADDLE_ENFORCE_EQ(
data_format, "NHWC",
platform::errors::InvalidArgument("The data format must equal to NHWC. "
"But received: the data format "
"= [%s]",
data_format));
// Calculate the dims of outputs
int batch = x_dims[0];
int output_channel = w_dims[0];
int filter_size = w_dims[2];
int stride = ctx->Attrs().Get<int>("stride");
int padding = ctx->Attrs().Get<int>("padding");
int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1;
int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1;
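// For example (kernel 3, padding 1): a 56x56 input stays 56x56 with stride 1,
// and becomes 28x28 with stride 2.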
std::vector<int> out_shape = {batch, out_h, out_w, output_channel};
auto y_dims = framework::make_ddim(out_shape);
auto bitmask_dims = GetBitmaskDims(out_shape);
// Set dims of outputs
ctx->SetOutputDim("Y", y_dims);
ctx->SetOutputDim("BitMask", bitmask_dims);
ctx->SetOutputDim("ConvX", y_dims);
ctx->SetOutputDim("SavedMeanX", bn_param_dims);
ctx->SetOutputDim("SavedInvstdX", bn_param_dims);
ctx->SetOutputDim("RunningMeanX", bn_param_dims);
ctx->SetOutputDim("RunningVarX", bn_param_dims);
if (has_shortcut) {
ctx->SetOutputDim("ConvZ", y_dims);
ctx->SetOutputDim("SavedMeanZ", bn_param_dims);
ctx->SetOutputDim("SavedInvstdZ", bn_param_dims);
ctx->SetOutputDim("RunningMeanZ", bn_param_dims);
ctx->SetOutputDim("RunningVarZ", bn_param_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
// By default, the type of the scale, bias, mean,
// and var tensors should be float when input tensor's dtype is float16.
auto bn_param_type = framework::proto::VarType::FP32;
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("ScaleX")->type(),
platform::errors::InvalidArgument(
"Scale input should be of float type"));
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("BiasX")->type(),
platform::errors::InvalidArgument(
"Bias input should be of float type"));
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
}
};
class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "The input 1 tensor");
AddInput("FilterX", "Filter tensor of input 1");
AddInput("ScaleX", "Scale tensor of input 1 used in batchnorm");
AddInput("BiasX", "Bias tensor of input 1 used in batchnorm");
AddInput("MeanX", "Mean tensor of input 1 used in batchnorm");
AddInput("VarX", "Variance tensor of input 1 used in batchnorm");
AddInput("Z", "The input 2 tensor").AsDispensable();
AddInput("FilterZ", "Filter tensor of input 2").AsDispensable();
AddInput("ScaleZ", "Scale tensor of input 2").AsDispensable();
AddInput("BiasZ", "Bias tensor of input 2").AsDispensable();
AddInput("MeanZ", "Mean tensor of input 2").AsDispensable();
AddInput("VarZ", "Variance tensor of input 2").AsDispensable();
AddOutput("Y", "The result of the resnet unit");
AddOutput("BitMask", "The bitmask generated after relu");
AddOutput("ConvX", "The output of input 1 after conv");
AddOutput("SavedMeanX", "Mean of input 1 in the current batch");
AddOutput("SavedInvstdX", "Invstd of input 1 in the current batch");
AddOutput("RunningMeanX", "Shared memory with MeanX");
AddOutput("RunningVarX", "Shared memory with VarX");
AddOutput("ConvZ", "The output of input 2 after conv").AsDispensable();
AddOutput("SavedMeanZ", "Mean of input 1 in the current batch")
.AsDispensable();
AddOutput("SavedInvstdZ", "Invstd of input 1 in the current batch")
.AsDispensable();
AddOutput("RunningMeanZ", "Shared memory with MeanZ").AsDispensable();
AddOutput("RunningVarZ", "Shared memory with VarZ").AsDispensable();
AddAttr<int>("stride", "").SetDefault(1);
AddAttr<int>("stride_z", "").SetDefault(1);
AddAttr<int>("padding", "").SetDefault(0);
AddAttr<int>("dilation", "").SetDefault(1);
AddAttr<int>("group", "").SetDefault(1);
AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "").SetDefault(1e-5);
AddAttr<std::string>("data_format", "").SetDefault("NHWC");
AddAttr<bool>("fuse_add", "").SetDefault(false);
AddAttr<bool>("has_shortcut", "").SetDefault(false);
AddAttr<bool>("use_global_stats", "").SetDefault(false);
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<bool>("use_addto", "").SetDefault(false);
AddAttr<std::string>("act_type", "The activation type to be fused.")
.SetDefault("relu");
AddComment(R"DOC(
Fusion op of the basic unit of a ResNet block.
The implementation is based on the fused ops interface introduced in cuDNN v8.0.
For more details:
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t
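The fused computation is: X -> NormConvolution -> BNStatsFinalize ->
ScaleBiasAddRelu -> Y, with an optional second branch for Z (convolved and
normalized when has_shortcut is true, or added directly when fuse_add is true).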
)DOC");
}
};
class ResNetUnitGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const {
// check input
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("ConvX"), "Input", "ConvX",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedMeanX"), "Input", "SavedMeanX",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedInvstdX"), "Input", "SavedInvstdX",
"ResNetUnitGradOp");
bool fuse_add = ctx->Attrs().Get<bool>("fuse_add");
bool has_shortcut = ctx->Attrs().Get<bool>("has_shortcut");
if (fuse_add || has_shortcut) {
OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitGradOp");
}
if (has_shortcut) {
OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("ConvZ"), "Input", "ConvZ",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedMeanZ"), "Input", "SavedMeanZ",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedInvstdZ"), "Input", "SavedInvstdZ",
"ResNetUnitGradOp");
}
OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput("BitMask"), "Input", "BitMask",
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
framework::GradVarName("Y"), "ResNetUnitGradOp");
// check output
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterX")), "Output",
framework::GradVarName("FilterX"), "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleX")), "Output",
framework::GradVarName("ScaleX"), "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasX")), "Output",
framework::GradVarName("BiasX"), "ResNetUnitGradOp");
if (fuse_add) {
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output",
framework::GradVarName("Z"), "ResNetUnitGradOp");
}
if (has_shortcut) {
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterZ")),
"Output", framework::GradVarName("FilterZ"),
"ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleZ")), "Output",
framework::GradVarName("ScaleZ"), "ResNetUnitGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasZ")), "Output",
framework::GradVarName("BiasZ"), "ResNetUnitGradOp");
}
const auto x_dims = ctx->GetInputDim("X");
const auto filter_x_dims = ctx->GetInputDim("FilterX");
const auto param_dims = ctx->GetInputDim("ScaleX");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->SetOutputDim(framework::GradVarName("FilterX"), filter_x_dims);
ctx->SetOutputDim(framework::GradVarName("ScaleX"), param_dims);
ctx->SetOutputDim(framework::GradVarName("BiasX"), param_dims);
if (fuse_add || has_shortcut) {
const auto z_dims = ctx->GetInputDim("Z");
ctx->SetOutputDim(framework::GradVarName("Z"), z_dims);
}
if (has_shortcut) {
const auto filter_z_dims = ctx->GetInputDim("FilterZ");
ctx->SetOutputDim(framework::GradVarName("FilterZ"), filter_z_dims);
ctx->SetOutputDim(framework::GradVarName("ScaleZ"), param_dims);
ctx->SetOutputDim(framework::GradVarName("BiasZ"), param_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
PADDLE_ENFORCE_NOT_NULL(
ctx.InputVar(framework::GradVarName("Y")),
platform::errors::NotFound(
"Can not find Y@GRAD in the execution context."));
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
layout, library);
}
};
template <typename T>
class ResNetUnitGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("resnet_unit_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("FilterX", this->Input("FilterX"));
op->SetInput("ConvX", this->Output("ConvX"));
op->SetInput("ScaleX", this->Input("ScaleX"));
op->SetInput("BiasX", this->Input("BiasX"));
op->SetInput("SavedMeanX", this->Output("SavedMeanX"));
op->SetInput("SavedInvstdX", this->Output("SavedInvstdX"));
op->SetInput("Z", this->Input("Z"));
op->SetInput("FilterZ", this->Input("FilterZ"));
op->SetInput("ConvZ", this->Output("ConvZ"));
op->SetInput("ScaleZ", this->Input("ScaleZ"));
op->SetInput("BiasZ", this->Input("BiasZ"));
op->SetInput("SavedMeanZ", this->Output("SavedMeanZ"));
op->SetInput("SavedInvstdZ", this->Output("SavedInvstdZ"));
op->SetInput("Y", this->Output("Y"));
op->SetInput("BitMask", this->Output("BitMask"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("FilterX"),
this->InputGrad("FilterX"));
op->SetOutput(framework::GradVarName("ScaleX"), this->InputGrad("ScaleX"));
op->SetOutput(framework::GradVarName("BiasX"), this->InputGrad("BiasX"));
op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z"));
op->SetOutput(framework::GradVarName("FilterZ"),
this->InputGrad("FilterZ"));
op->SetOutput(framework::GradVarName("ScaleZ"), this->InputGrad("ScaleZ"));
op->SetOutput(framework::GradVarName("BiasZ"), this->InputGrad("BiasZ"));
}
};
class ResNetUnitOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
return m;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(resnet_unit, ops::ResNetUnitOp, ops::ResNetUnitOpMaker,
ops::ResNetUnitOpInferVarType,
ops::ResNetUnitGradOpMaker<paddle::framework::OpDesc>,
ops::ResNetUnitGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(resnet_unit_grad, ops::ResNetUnitGradOp);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h"
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class ResNetUnitKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must use CUDAPlace."));
PADDLE_ENFORCE_EQ(platform::CudnnDataType<T>::type, CUDNN_DATA_HALF,
platform::errors::Unavailable(
"ResNetUnitOp only supports float16 for now."));
// input x
const Tensor *input_x = ctx.Input<Tensor>("X");
const Tensor *filter_x = ctx.Input<Tensor>("FilterX");
const Tensor *scale_x = ctx.Input<Tensor>("ScaleX");
const Tensor *bias_x = ctx.Input<Tensor>("BiasX");
// norm conv
Tensor *conv_out_x = ctx.Output<Tensor>("ConvX");
// bn finalize
Tensor *saved_mean_x = ctx.Output<Tensor>("SavedMeanX");
Tensor *saved_invstd_x = ctx.Output<Tensor>("SavedInvstdX");
Tensor *running_mean_x = ctx.Output<Tensor>("RunningMeanX");
Tensor *running_var_x = ctx.Output<Tensor>("RunningVarX");
// sbar
Tensor *output = ctx.Output<Tensor>("Y");
Tensor *bitmask = ctx.Output<Tensor>("BitMask");
// attrs
int padding = ctx.Attr<int>("padding");
int stride = ctx.Attr<int>("stride");
int stride_z = ctx.Attr<int>("stride_z");
int dilation = ctx.Attr<int>("dilation");
int group = ctx.Attr<int>("group");
double eps = static_cast<double>(ctx.Attr<float>("epsilon"));
double momentum = static_cast<double>(ctx.Attr<float>("momentum"));
bool has_shortcut = ctx.Attr<bool>("has_shortcut");
bool fuse_add = ctx.Attr<bool>("fuse_add");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
bool is_test = ctx.Attr<bool>("is_test");
bool is_train = !is_test && !use_global_stats;
std::string act_type = ctx.Attr<std::string>("act_type");
auto input_x_shape = framework::vectorize<int>(input_x->dims());
auto filter_x_shape = framework::vectorize<int>(filter_x->dims());
auto param_dims = scale_x->dims();
auto param_shape = framework::vectorize<int>(scale_x->dims());
auto output_shape = framework::vectorize<int>(output->dims());
auto bitmask_shape = framework::vectorize<int>(bitmask->dims());
int output_channel = filter_x_shape[0];
int64_t ele_count =
std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>()) /
output_channel;
auto place = ctx.GetPlace();
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
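// Forward pass = three chained cuDNN fused ops:
//   (1) CudnnNormConvolution: conv + per-channel sum / sum-of-squares,
//   (2) CudnnBNStatsFinalize: turn the stats into equivalent scale/bias and
//       update the saved/running mean and variance,
//   (3) CudnnScaleBiasAddRelu: apply scale/bias, optionally add the shortcut
//       branch, relu, and emit the bitmask for backward.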
// 1. Conv
Tensor sum_x;
Tensor sum_of_squares_x;
sum_x.Resize(param_dims);
sum_of_squares_x.Resize(param_dims);
CudnnNormConvolution<T> conv_x_op(dev_ctx, input_x_shape, filter_x_shape,
output_shape, padding, stride, dilation,
group);
conv_x_op.Forward(dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x,
&sum_of_squares_x);
// 2. BN
Tensor equiv_scale_x;
Tensor equiv_bias_x;
equiv_scale_x.Resize(param_dims);
equiv_bias_x.Resize(param_dims);
CudnnBNStatsFinalize<T> bn_x_op(dev_ctx, param_shape);
bn_x_op.Forward(dev_ctx, sum_x, sum_of_squares_x, *scale_x, *bias_x,
saved_mean_x, saved_invstd_x, running_mean_x, running_var_x,
&equiv_scale_x, &equiv_bias_x, eps, momentum, ele_count,
is_train);
// 3. scale + bias + add + relu
CudnnScaleBiasAddRelu<T> sbar_op(dev_ctx, act_type, fuse_add, has_shortcut,
output_shape, param_shape, bitmask_shape);
if (has_shortcut) {
// input z
const Tensor *input_z = ctx.Input<Tensor>("Z");
const Tensor *filter_z = ctx.Input<Tensor>("FilterZ");
const Tensor *scale_z = ctx.Input<Tensor>("ScaleZ");
const Tensor *bias_z = ctx.Input<Tensor>("BiasZ");
// norm conv
Tensor *conv_out_z = ctx.Output<Tensor>("ConvZ");
// bn finalize
Tensor *saved_mean_z = ctx.Output<Tensor>("SavedMeanZ");
Tensor *saved_invstd_z = ctx.Output<Tensor>("SavedInvstdZ");
Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");
auto input_z_shape = framework::vectorize<int>(input_z->dims());
auto filter_z_shape = framework::vectorize<int>(filter_z->dims());
// 3.1 Conv for second input
Tensor sum_z;
Tensor sum_of_squares_z;
sum_z.Resize(param_dims);
sum_of_squares_z.Resize(param_dims);
CudnnNormConvolution<T> conv_z_op(dev_ctx, input_z_shape, filter_z_shape,
output_shape, padding, stride_z,
dilation, group);
conv_z_op.Forward(dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z,
&sum_of_squares_z);
// 3.2 BN for second input
Tensor equiv_scale_z;
Tensor equiv_bias_z;
equiv_scale_z.Resize(param_dims);
equiv_bias_z.Resize(param_dims);
CudnnBNStatsFinalize<T> bn_z_op(dev_ctx, param_shape);
bn_z_op.Forward(dev_ctx, sum_z, sum_of_squares_z, *scale_z, *bias_z,
saved_mean_z, saved_invstd_z, running_mean_z,
running_var_z, &equiv_scale_z, &equiv_bias_z, eps,
momentum, ele_count, is_train);
// 3.3 sbar
sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x,
conv_out_z, &equiv_scale_z, &equiv_bias_z, output,
bitmask);
} else {
const Tensor *input_z = fuse_add ? ctx.Input<Tensor>("Z") : nullptr;
sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x,
input_z, nullptr, nullptr, output, bitmask);
}
}
};
template <typename T>
class ResNetUnitGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must use CUDAPlace."));
PADDLE_ENFORCE_EQ(platform::CudnnDataType<T>::type, CUDNN_DATA_HALF,
platform::errors::Unavailable(
"ResNetUnitOp only supports float16 for now."));
const Tensor *y_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
const Tensor *x = ctx.Input<Tensor>("X");
const Tensor *filter_x = ctx.Input<Tensor>("FilterX");
const Tensor *scale_x = ctx.Input<Tensor>("ScaleX");
const Tensor *bias_x = ctx.Input<Tensor>("BiasX");
const Tensor *saved_mean_x = ctx.Input<Tensor>("SavedMeanX");
const Tensor *saved_invstd_x = ctx.Input<Tensor>("SavedInvstdX");
const Tensor *conv_out_x = ctx.Input<Tensor>("ConvX");
const Tensor *output = ctx.Input<Tensor>("Y");
const Tensor *bitmask = ctx.Input<Tensor>("BitMask");
Tensor *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor *filter_x_grad =
ctx.Output<Tensor>(framework::GradVarName("FilterX"));
Tensor *scale_x_grad = ctx.Output<Tensor>(framework::GradVarName("ScaleX"));
Tensor *bias_x_grad = ctx.Output<Tensor>(framework::GradVarName("BiasX"));
int padding = ctx.Attr<int>("padding");
int stride = ctx.Attr<int>("stride");
int stride_z = ctx.Attr<int>("stride_z");
int dilation = ctx.Attr<int>("dilation");
int group = ctx.Attr<int>("group");
double eps = static_cast<double>(ctx.Attr<float>("epsilon"));
double momentum = static_cast<double>(ctx.Attr<float>("momentum"));
bool has_shortcut = ctx.Attr<bool>("has_shortcut");
bool fuse_add = ctx.Attr<bool>("fuse_add");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
std::string act_type = ctx.Attr<std::string>("act_type");
auto x_shape = framework::vectorize<int>(x->dims());
auto filter_x_shape = framework::vectorize<int>(filter_x->dims());
auto param_shape = framework::vectorize<int>(scale_x->dims());
auto output_shape = framework::vectorize<int>(output->dims());
auto bitmask_shape = framework::vectorize<int>(bitmask->dims());
auto place = ctx.GetPlace();
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// 1. Backward of BN (+ Add + Relu) for x, get conv_out_x_grad,
// scale_x_grad, bias_x_grad
Tensor conv_out_x_grad;
conv_out_x_grad.Resize(conv_out_x->dims());
CudnnScaleBiasAddRelu<T> sbar_x_op(dev_ctx, act_type, fuse_add,
has_shortcut, output_shape, param_shape,
bitmask_shape);
if (has_shortcut) {
// X Z
// | |
// NormConv NormConv
// | |
// BNStatsFinalize BNStatsFinalize
// \ /
// ScaleBiasAddRelu
// |
// Y
const Tensor *z = ctx.Input<Tensor>("Z");
const Tensor *filter_z = ctx.Input<Tensor>("FilterZ");
const Tensor *scale_z = ctx.Input<Tensor>("ScaleZ");
const Tensor *bias_z = ctx.Input<Tensor>("BiasZ");
const Tensor *saved_mean_z = ctx.Input<Tensor>("SavedMeanZ");
const Tensor *saved_invstd_z = ctx.Input<Tensor>("SavedInvstdZ");
const Tensor *conv_out_z = ctx.Input<Tensor>("ConvZ");
Tensor *z_grad = ctx.Output<Tensor>(framework::GradVarName("Z"));
Tensor *filter_z_grad =
ctx.Output<Tensor>(framework::GradVarName("FilterZ"));
Tensor *scale_z_grad =
ctx.Output<Tensor>(framework::GradVarName("ScaleZ"));
Tensor *bias_z_grad = ctx.Output<Tensor>(framework::GradVarName("BiasZ"));
// 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad,
// scale_x_grad, bias_x_grad and z_grad_temp
Tensor z_grad_temp;
z_grad_temp.Resize(conv_out_z->dims());
sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x,
*saved_mean_x, *saved_invstd_x, bitmask,
&conv_out_x_grad, &z_grad_temp, scale_x_grad,
bias_x_grad, eps);
// 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z
Tensor conv_out_z_grad;
conv_out_z_grad.Resize(conv_out_z->dims());
CudnnScaleBiasAddRelu<T> sbar_z_op(
dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape);
sbar_z_op.Backward(dev_ctx, z_grad_temp, *conv_out_z, *scale_z, *bias_z,
*saved_mean_z, *saved_invstd_z, nullptr,
&conv_out_z_grad, nullptr, scale_z_grad, bias_z_grad,
eps);
// 1.3 Backward of Conv for z, get z_grad and filter_z_grad
auto z_shape = framework::vectorize<int>(z->dims());
auto filter_z_shape = framework::vectorize<int>(filter_z->dims());
CudnnNormConvolutionGrad<T> conv_z_op(dev_ctx, z_shape, filter_z_shape,
output_shape, padding, stride_z,
dilation, group);
conv_z_op.Backward(dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad,
filter_z_grad);
} else {
// 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad,
// scale_x_grad, bias_x_grad (and z_grad)
Tensor *z_grad =
fuse_add ? ctx.Output<Tensor>(framework::GradVarName("Z")) : nullptr;
sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x,
*saved_mean_x, *saved_invstd_x, bitmask,
&conv_out_x_grad, z_grad, scale_x_grad, bias_x_grad,
eps);
}
// 2. Backward of Conv for x, get x_grad and filter_x_grad
bool use_addto = ctx.Attr<bool>("use_addto");
CudnnNormConvolutionGrad<T> conv_x_op(dev_ctx, x_shape, filter_x_shape,
output_shape, padding, stride,
dilation, group);
conv_x_op.Backward(dev_ctx, *x, *filter_x, conv_out_x_grad, x_grad,
filter_x_grad, use_addto);
}
};
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 8000
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(resnet_unit, ops::ResNetUnitKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(resnet_unit_grad,
ops::ResNetUnitGradKernel<plat::float16>);
#endif
......@@ -68,8 +68,9 @@ class AvgPool {
template <class T>
class MaxPoolGrad {
public:
DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T* dx) {
static constexpr bool use_x = true;
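// use_x marks whether compute() actually reads x and y (max pooling needs them
// to locate the maximal element; AvgPoolGrad below sets it to false),
// presumably letting the pooling grad kernels skip loading those tensors.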
HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T* dx) {
*dx += dy * static_cast<T>(x == y);
}
};
......@@ -77,8 +78,9 @@ class MaxPoolGrad {
template <class T>
class AvgPoolGrad {
public:
DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T* dx) {
static constexpr bool use_x = false;
HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T* dx) {
*dx += (scale * dy);
}
};
......
......@@ -13,46 +13,158 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
namespace paddle {
namespace operators {
class LarsMomentumOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("Param"), "Input", "Param", "LarsMomentum");
OP_INOUT_CHECK(ctx->HasInputs("Grad"), "Input", "Grad", "LarsMomentum");
OP_INOUT_CHECK(ctx->HasInputs("Velocity"), "Input", "Velocity",
"LarsMomentum");
OP_INOUT_CHECK(ctx->HasInputs("LearningRate"), "Input", "LearningRate",
"LarsMomentum");
OP_INOUT_CHECK(ctx->HasOutputs("ParamOut"), "Output", "ParamOut",
"LarsMomentum");
OP_INOUT_CHECK(ctx->HasOutputs("VelocityOut"), "Output", "VelocityOut",
"LarsMomentum");
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
auto lr_dims = ctx->GetInputsDim("LearningRate");
auto grad_dim = ctx->GetInputsDim("Grad");
auto param_dim = ctx->GetInputsDim("Param");
auto velocity_dim = ctx->GetInputsDim("Velocity");
auto lars_weight_decays =
ctx->Attrs().Get<std::vector<float>>("lars_weight_decay");
auto multi_precision = ctx->Attrs().Get<bool>("multi_precision");
PADDLE_ENFORCE_EQ(
param_dim.size(), grad_dim.size(),
platform::errors::InvalidArgument(
"Input(Param) and Input(Grad) of LarsMomentumOp should have "
"same quantity. But number of Param is [%d] and Grad is [%d].",
param_dim.size(), grad_dim.size()));
PADDLE_ENFORCE_EQ(
param_dim.size(), velocity_dim.size(),
platform::errors::InvalidArgument(
"Input(Param) and Input(Velocity) of LarsMomentumOp should "
"have same quantity. But number of Param is [%d] and Velocity "
"is [%d].",
param_dim.size(), velocity_dim.size()));
PADDLE_ENFORCE_EQ(
lars_weight_decays.size(), grad_dim.size(),
platform::errors::InvalidArgument(
"Attr(Lars_weight_decay) and "
"Input(Grad) of LarsMomentumOp should have same quantity. "
"But number of Lars_weight_decay is [%d] and Grad is [%d].",
lars_weight_decays.size(), grad_dim.size()));
if (multi_precision) {
OP_INOUT_CHECK(ctx->HasInputs("MasterParam"), "Input", "MasterParam",
"LarsMomentumMultiPrecision");
OP_INOUT_CHECK(ctx->HasOutputs("MasterParamOut"), "Output",
"MasterParamOut", "LarsMomentumMultiPrecision");
}
for (size_t i = 0; i < lr_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(framework::product(lr_dims[i]), 1,
platform::errors::InvalidArgument(
"Learning_rate should be a scalar. But Received "
"LearningRate's dim [%s]",
framework::product(lr_dims[i])));
}
for (size_t i = 0; i < param_dim.size(); ++i) {
PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad")[i],
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx->Inputs("Grad")[i].front(),
ctx->GetInputsVarType("Grad")[i]));
PADDLE_ENFORCE_EQ(
param_dim[i], grad_dim[i],
platform::errors::InvalidArgument(
"Input(Param) and Input(Grad) input of LarsMomentumOp shall "
"have same dimension. But Param`s dim is [%s] and Grad's dim "
"is [%s].",
param_dim[i], grad_dim[i]));
PADDLE_ENFORCE_EQ(
param_dim[i], velocity_dim[i],
platform::errors::InvalidArgument(
"Input(Param) and Input(Velocity) of LarsMomentumOp shall have "
"same dimension. But Param dim [%s] differs with Velocity dim "
"[%s].",
param_dim[i], velocity_dim[i]));
}
ctx->SetOutputsDim("ParamOut", param_dim);
ctx->SetOutputsDim("VelocityOut", param_dim);
if (ctx->HasOutputs("MasterParamOut")) {
ctx->SetOutputsDim("MasterParamOut", param_dim);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param",
"(LoDTensor, default LoDTensor<float>) "
"Input parameter that has to be updated");
"Input parameter that has to be updated")
.AsDuplicable();
AddInput("Grad",
"(LoDTensor, default LoDTensor<float>) "
"Input gradient of the parameter");
"Input gradient of the parameter")
.AsDuplicable();
AddInput("Velocity",
"(LoDTensor, default LoDTensor<float>) "
"Input velocity (corresponding to the parameter) "
"that has to be updated");
"that has to be updated")
.AsDuplicable();
AddInput("LearningRate",
"(LoDTensor, default LoDTensor<float>) "
"Input learning rate");
AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
"Input learning rate")
.AsDuplicable();
AddInput("MasterParam", "FP32 master weight for AMP.")
.AsDuplicable()
.AsDispensable();
AddOutput("ParamOut",
"(LoDTensor) This output is updated parameter. "
"It shared memory with Input(Param).");
"It shared memory with Input(Param).")
.AsDuplicable();
AddOutput("VelocityOut",
"(LoDTensor) This output is updated velocity. "
"It shared memory with Input(Velocity).");
"It shared memory with Input(Velocity).")
.AsDuplicable();
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDuplicable()
.AsDispensable();
AddAttr<float>("mu", "(float) Momentum coefficient");
AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
.SetDefault(0.001);
AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005);
AddAttr<std::vector<float>>(
"lars_weight_decay",
"(std::vector<float>, default 0.0005) LARS weight decay params")
.SetDefault({0.0005});
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
......@@ -68,10 +180,8 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Lars Momentum Optimizer.
This optimizer uses LARS (https://arxiv.org/abs/1708.03888) to optimize each
weight using a local learning rate:
$$
local\_lr = \eta *
\frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
......@@ -79,10 +189,8 @@ velocity = mu * velocity +
local\_lr * (grad + \beta * param) \\
param = param - velocity. \\
$$
Note that we use lars_weight_decay here to decay weights; you may not need to
use L2 regularizers when using LARS.
)DOC");
}
};
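To make the update described in the DOC block above concrete, here is a small worked evaluation of the local learning rate. The numbers are arbitrary and chosen only for illustration, with epsilon taken as 0:
$$
\left \| param \right \| = 10,\quad \left \| grad \right \| = 2,\quad lr = 1.0,\quad lars\_coeff = 0.001,\quad \beta = 0.0005 \\
local\_lr = 1.0 \times 0.001 \times \frac{10}{2 + 0.0005 \times 10} = \frac{0.01}{2.005} \approx 0.00499
$$
Note that the kernel implementation further below scales by lars_coeff and adds epsilon to the denominator; the DOC formula folds the lars_coeff factor into \eta.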
......@@ -96,7 +204,7 @@ class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators;
REGISTER_OPERATOR(
lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
lars_momentum, ops::LarsMomentumOp, ops::LarsMomentumOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::LarsMomentumOpVarTypeInference);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -23,54 +23,48 @@ template <typename T>
class LarsMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
auto param = ctx.Input<framework::LoDTensor>("Param");
auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
auto* grad_var = ctx.InputVar("Grad");
// only support dense for now.
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto grad = ctx.Input<framework::LoDTensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto param_out = ctx.MultiOutput<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.MultiOutput<framework::LoDTensor>("VelocityOut");
auto param = ctx.MultiInput<framework::LoDTensor>("Param");
auto velocity = ctx.MultiInput<framework::LoDTensor>("Velocity");
auto learning_rate = ctx.MultiInput<framework::LoDTensor>("LearningRate");
auto grad = ctx.MultiInput<framework::LoDTensor>("Grad");
auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
auto* lr = learning_rate[i]->data<T>();
T lars_weight_decay = weight_decay_arr[i];
param_out[i]->mutable_data<T>(ctx.GetPlace());
velocity_out[i]->mutable_data<T>(ctx.GetPlace());
auto p = framework::EigenVector<T>::Flatten(*param);
auto v = framework::EigenVector<T>::Flatten(*velocity);
auto g = framework::EigenVector<T>::Flatten(*grad);
auto* lr = learning_rate->data<T>();
auto p_out = framework::EigenVector<T>::Flatten(*(param_out[i]));
auto v_out = framework::EigenVector<T>::Flatten(*(velocity_out[i]));
auto p = framework::EigenVector<T>::Flatten(*(param[i]));
auto v = framework::EigenVector<T>::Flatten(*(velocity[i]));
auto g = framework::EigenVector<T>::Flatten(*(grad[i]));
framework::Tensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
p_norm_t.mutable_data<T>(ctx.GetPlace());
g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
framework::Tensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
p_norm_t.mutable_data<T>(ctx.GetPlace());
g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt();
ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt();
T local_lr = lr[0];
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
T local_lr = lr[0];
if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
}
v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out;
}
v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out;
}
};
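The rewritten CPU kernel above is the heart of this change: every input and output is now a duplicable list, and each parameter gets its own LearningRate tensor and lars_weight_decay entry. Below is a minimal self-contained sketch of that merged loop in plain C++; the flattened std::vector tensors and the run_merged_lars name are illustrative assumptions, not part of the operator's interface.
#include <cmath>
#include <numeric>
#include <vector>
// Illustrative sketch only: applies the LARS rule to parallel lists of
// flattened tensors, mirroring the kernel loop above, with one learning rate
// and one weight-decay entry per parameter.
using Tensor = std::vector<float>;
void run_merged_lars(std::vector<Tensor>* params,
                     std::vector<Tensor>* velocities,
                     const std::vector<Tensor>& grads,
                     const std::vector<float>& lrs,
                     const std::vector<float>& weight_decays,
                     float mu, float lars_coeff, float epsilon) {
  for (size_t i = 0; i < params->size(); ++i) {
    Tensor& p = (*params)[i];
    Tensor& v = (*velocities)[i];
    const Tensor& g = grads[i];
    const float wd = weight_decays[i];
    const float p_norm =
        std::sqrt(std::inner_product(p.begin(), p.end(), p.begin(), 0.0));
    const float g_norm =
        std::sqrt(std::inner_product(g.begin(), g.end(), g.begin(), 0.0));
    float local_lr = lrs[i];
    if (wd > 0.f && p_norm > 0.f && g_norm > 0.f) {
      local_lr =
          lrs[i] * lars_coeff * p_norm / (g_norm + wd * p_norm + epsilon);
    }
    for (size_t j = 0; j < p.size(); ++j) {
      v[j] = mu * v[j] + local_lr * (g[j] + wd * p[j]);
      p[j] -= v[j];
    }
  }
}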
......
This diff is collapsed.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
merged_momentum,
ops::MergedMomentumOpKernel<plat::CUDADeviceContext, plat::float16>,
ops::MergedMomentumOpKernel<plat::CUDADeviceContext, float>,
ops::MergedMomentumOpKernel<plat::CUDADeviceContext, double>);
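For context, merged_momentum is registered here for float16, float and double, while its kernel body lives in merged_momentum_op.h, which is not expanded in this view. As a rough, assumed illustration only (the real kernel's exact semantics, e.g. Nesterov and multi-precision handling, are defined in that header and not modeled here), a plain momentum update applied across a list of parameters in a single call looks like this:
#include <vector>
// Illustrative sketch only: standard SGD-with-momentum applied to parallel
// lists of flattened tensors in one call, i.e. the kind of batched ("merged")
// update such a kernel performs.
using Tensor = std::vector<float>;
void run_merged_momentum(std::vector<Tensor>* params,
                         std::vector<Tensor>* velocities,
                         const std::vector<Tensor>& grads,
                         const std::vector<float>& lrs, float mu) {
  for (size_t i = 0; i < params->size(); ++i) {
    Tensor& p = (*params)[i];
    Tensor& v = (*velocities)[i];
    const Tensor& g = grads[i];
    for (size_t j = 0; j < p.size(); ++j) {
      v[j] = mu * v[j] + g[j];  // velocity accumulation
      p[j] -= lrs[i] * v[j];    // parameter step
    }
  }
}
Batching many small parameter updates into one operator call reduces per-op and kernel-launch overhead, which is presumably the motivation for the merged variants in this PR.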
This diff is collapsed.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
pow2_decay_with_linear_warmup,
ops::Pow2DecayWithLinearWarmupOpKernel<plat::CUDADeviceContext, double>,
ops::Pow2DecayWithLinearWarmupOpKernel<plat::CUDADeviceContext, float>);
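pow2_decay_with_linear_warmup is likewise only registered for CUDA here; its computation is defined in pow2_decay_with_linear_warmup_op.h, which this excerpt does not expand. The op name suggests a learning-rate schedule with a linear warmup phase followed by a quadratic (pow2) decay; the sketch below shows one common form of such a schedule and is an assumed illustration, not the op's exact formula:
#include <algorithm>
#include <cstdint>
// Assumed illustration of a "linear warmup, then quadratic (pow2) decay"
// schedule; the registered op's exact formula lives in its header, which is
// not shown in this excerpt. Assumes total_steps > warmup_steps.
float pow2_decay_with_linear_warmup(int64_t step, int64_t warmup_steps,
                                    int64_t total_steps, float base_lr,
                                    float end_lr) {
  if (step < warmup_steps) {
    // Linear warmup from 0 up to base_lr.
    return base_lr * static_cast<float>(step) /
           static_cast<float>(warmup_steps);
  }
  // Quadratic decay from base_lr down to end_lr over the remaining steps.
  const float progress =
      std::min(1.0f, static_cast<float>(step - warmup_steps) /
                         static_cast<float>(total_steps - warmup_steps));
  const float factor = (1.0f - progress) * (1.0f - progress);
  return (base_lr - end_lr) * factor + end_lr;
}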
The remaining file diffs in this commit are collapsed and not shown.