Commit 945f20a3 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_embedding_to_phi

@@ -26,7 +26,7 @@ add_definitions(-w)
######################################
include(ExternalProject)
set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
-set(CINN_GIT_TAG release/v0.1)
set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2)
set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
                       -DWITH_CUDA=${WITH_GPU}
                       -DWITH_CUDNN=${WITH_GPU}
......
@@ -99,7 +99,8 @@ endfunction()
function(mlir_add_rewriter td_base)
set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
-mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass")
set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td)
mlir_tablegen(${td_base}.cpp.inc -gen-rewriters)
add_public_tablegen_target(MLIR${td_base}IncGen)
add_dependencies(mlir-headers MLIR${td_base}IncGen)
endfunction()
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api)
if (WITH_DISTRIBUTE)
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
endif()
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
......
@@ -88,8 +88,8 @@ void SyncDefaultStream(
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
-ncclEvents[i].Record(*dev_ctx[i]);
-ncclEvents[i].Block(*default_ctx);
ncclEvents[i].Record(*default_ctx);
ncclEvents[i].Block(*dev_ctx[i]);
}
}
......
@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/distributed/collective/reducer.h"
-#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace distributed {
@@ -127,5 +126,430 @@ std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
return res;
}
template <typename DeviceContext, typename T>
static void ConcatTensorsForAllReduce(
const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents) {
operators::math::ConcatFunctor<DeviceContext, T> concat_functor_;
concat_functor_(
context, dense_tensors_, 0,
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get());
}
template <typename DeviceContext, typename T>
static void SplitTensorsForAllReduce(
const DeviceContext &context, Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors) {
auto *in =
std::dynamic_pointer_cast<phi::DenseTensor>(p_dense_contents->impl())
.get();
std::vector<phi::DenseTensor *> outs;
std::vector<const phi::DenseTensor *> shape_refer;
outs.reserve(p_dense_tensors->size());
shape_refer.reserve(p_dense_tensors->size());
for (auto &tensor : *p_dense_tensors) {
outs.emplace_back(&tensor);
shape_refer.emplace_back(&tensor);
}
operators::math::SplitFunctor<DeviceContext, T> split_functor_;
split_functor_(context, *in, shape_refer, 0, &outs);
}
// context is used to select the stream for concat
template <typename DeviceContext>
static void ConcatTensorsWithType(
const DeviceContext &context,
const std::vector<phi::DenseTensor> &dense_tensors_,
Tensor *p_dense_contents, phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
ConcatTensorsForAllReduce<DeviceContext, platform::float16>(
context, dense_tensors_, p_dense_contents);
break;
case phi::DataType::FLOAT32:
ConcatTensorsForAllReduce<DeviceContext, float>(context, dense_tensors_,
p_dense_contents);
break;
case phi::DataType::FLOAT64:
ConcatTensorsForAllReduce<DeviceContext, double>(context, dense_tensors_,
p_dense_contents);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it concats tensors for "
"allreduce.",
type));
}
}
// context is used to select the stream for split
template <typename DeviceContext>
static void SplitTensorsWithType(const DeviceContext &context,
Tensor *p_dense_contents,
std::vector<phi::DenseTensor> *p_dense_tensors,
phi::DataType type) {
switch (type) {
case phi::DataType::FLOAT16:
SplitTensorsForAllReduce<DeviceContext, platform::float16>(
context, p_dense_contents, p_dense_tensors);
break;
case phi::DataType::FLOAT32:
SplitTensorsForAllReduce<DeviceContext, float>(context, p_dense_contents,
p_dense_tensors);
break;
case phi::DataType::FLOAT64:
SplitTensorsForAllReduce<DeviceContext, double>(context, p_dense_contents,
p_dense_tensors);
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when it splits tensors for "
"allreduce.",
type));
}
}
void EagerGroup::ConcatTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_,
dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Concat grad tensor not supported on place (%s)", place));
}
}
void EagerGroup::SplitTensors(const platform::Place &place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_,
dtype_);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split grad tensor since it's not compiled with NCCL,"
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_,
dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", place));
}
}
EagerReducer::EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits, bool find_unused_parameters)
: tensors_(tensors),
group_indices_(group_indices),
is_sparse_gradient_(is_sparse_gradient),
process_group_(process_group),
group_size_limits_(group_size_limits),
find_unused_vars_each_step_(find_unused_parameters) {
VLOG(3) << "Start construct the Reducer ...";
nranks_ = process_group_->GetSize();
// initialize groups
InitializeGroups(group_indices);
for (size_t global_var_index = 0; global_var_index < tensors_.size();
++global_var_index) {
auto tensor = tensors_[global_var_index];
auto reduce_hook = [=](void) -> void {
this->AddDistHook(global_var_index);
};
const auto &grad_node = GetGradNodeFromTensor(&tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node,"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"));
const auto &accumulation_grad_node =
std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
accumulation_grad_node->RegisterReduceHook(
std::make_shared<egr::CppTensorVoidHook>(reduce_hook));
}
vars_marked_ready_.resize(tensors_.size(), false);
local_used_vars_.resize(tensors_.size(), 0);
}
std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
Tensor *tensor) {
auto *autograd_meta = tensor->get_autograd_meta();
const auto &grad_node =
static_cast<egr::AutogradMeta *>(autograd_meta)->GetMutableGradNode();
return grad_node;
}
void EagerReducer::InitializeGroups(
const std::vector<std::vector<size_t>> &group_indices) {
VLOG(3) << "Start initialize groups ..";
// clear the group
groups_.clear();
groups_.reserve(group_indices.size());
variable_locators_.clear();
variable_locators_.resize(tensors_.size());
auto group_nums = group_indices.size();
for (size_t group_index = 0; group_index < group_nums; ++group_index) {
const auto &tensor_indices_ = group_indices[group_index];
PADDLE_ENFORCE_GT(
tensor_indices_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of group[%d]'s elements is 0.", group_index));
EagerGroup group;
// It's just for check the sparse or dense
auto first_var = tensors_[tensor_indices_.front()];
if (tensor_indices_.size() == 1 &&
is_sparse_gradient_[tensor_indices_.front()]) {
// process the sparse gradient. one sparse, one group
group.dtype_ = first_var.dtype();
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
experimental::Backend backend;
switch (inner_place_.GetType()) {
case phi::AllocationType::GPU:
backend = experimental::Backend::GPU;
break;
case phi::AllocationType::CPU:
backend = experimental::Backend::CPU;
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Place type (%s) is not supported. ", inner_place_));
break;
}
group.dense_contents_ = paddle::experimental::empty(
ScalarArray({group.all_length_}), group.dtype_, backend);
}
// map tensors to this group by VariableLocator
size_t inside_group_index = 0;
for (const auto var_index : tensor_indices_) {
TensorLocator tensor_locator;
tensor_locator.group_index = group_index;
tensor_locator.inside_group_index = inside_group_index++;
variable_locators_[var_index] = tensor_locator;
}
group.tensor_indices_ = std::move(tensor_indices_);
groups_.emplace_back(std::move(group));
VLOG(3) << "The Group[" << group_index << "]:" << groups_.back();
}
}
void EagerReducer::InitializeDenseGroups(
const std::vector<size_t> &tensor_indices_, EagerGroup *p_group) {
VLOG(3) << "InitializeDenseGroups.";
int64_t all_length = 0;
for (size_t index = 0; index < tensor_indices_.size(); ++index) {
auto tensor_index = tensor_indices_[index];
auto &tensor = tensors_[tensor_index];
auto &tensor_name = tensor.name();
PADDLE_ENFORCE_EQ(tensor.is_initialized(), true,
platform::errors::PreconditionNotMet(
"Tensor %s is not initialized.", tensor_name));
const auto size = tensor.numel();
PADDLE_ENFORCE_GT(
size, 0, platform::errors::PreconditionNotMet(
"The number of tensor %s's elements is 0.", tensor_name));
all_length += size;
p_group->length_.push_back(size);
// for concat operator
p_group->origin_shapes_.push_back(ScalarArray(tensor.shape()));
p_group->dense_tensors_.push_back(phi::DenseTensor());
const auto &dtype = tensor.dtype();
const auto &place = tensor.place();
const auto &inner_place = tensor.impl()->place();
if (index > 0) {
PADDLE_ENFORCE_EQ(dtype, p_group->dtype_,
platform::errors::PreconditionNotMet(
"Tensor %s has unexpected dtype.", tensor_name));
PADDLE_ENFORCE_EQ(place, place_,
platform::errors::PreconditionNotMet(
"Tensor %s has different place. Expected place is "
"%s, but actual place is %s",
tensor_name, inner_place_, inner_place));
} else {
p_group->dtype_ = dtype;
place_ = place;
inner_place_ = inner_place;
}
}
p_group->all_length_ = all_length;
}
void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
VLOG(3) << "after forward, then reset count for backward.";
grad_need_hooks_ = true;
next_group_ = 0;
std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) {
group.pending_ = group.tensor_indices_.size();
});
// reinitialize vars_marked_ready_ for next iteration
vars_marked_ready_.clear();
vars_marked_ready_.resize(tensors_.size(), false);
}
void EagerReducer::AddDistHook(size_t var_index) {
PADDLE_ENFORCE_LT(var_index, variable_locators_.size(),
platform::errors::OutOfRange(
"Out of bounds variable index. it must be less"
"than %d, but it is %d",
variable_locators_.size(), var_index));
// gradient synchronization is not required when grad_need_hooks_ is false.
if (!grad_need_hooks_) {
return;
}
auto &tensor = tensors_[var_index];
const auto &grad_node = GetGradNodeFromTensor(&tensor);
VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name()
<< "] arrived and triggered disthook";
local_used_vars_[var_index] = 1;
MarkVarReady(var_index, true);
}
void EagerReducer::MarkVarReady(const size_t var_index,
const bool is_used_var) {
const auto &var_locator = variable_locators_[var_index];
const auto group_index = var_locator.group_index;
const auto inside_group_index = var_locator.inside_group_index;
auto &group = groups_[group_index];
auto &group_tensor = group.dense_tensors_[inside_group_index];
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
group_tensor
.ShareDataWith(
*(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
.Resize({grad_tensor.numel()});
vars_marked_ready_[var_index] = true;
if (--group.pending_ == 0) {
// can start allreduce
MarkGroupReady(group_index);
}
}
void EagerReducer::MarkGroupReady(size_t group_index) {
VLOG(3) << "Group[" << group_index << "] is ready";
PADDLE_ENFORCE_GE(
group_index, next_group_,
platform::errors::PreconditionNotMet(
"The index of the incoming group must be greater "
"than or equal to the previously synchronized group index, "
"expect it to greater than or equal to %d, but got %d.",
next_group_, group_index));
if (group_index > next_group_) {
VLOG(3) << "It will adjust the order of group in next batch automatically";
return;
}
for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
++next_group_) {
UNUSED auto &group = groups_[next_group_];
FusedAllReduceSchedule(&group, next_group_);
}
}
void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
const int curr_group_index) {
// The overall timeline: concat > div_nranks > allreduce > split
distributed::AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce.";
// concat tensors
group->ConcatTensors(inner_place_);
// div nranks
double scaling = 1.0 / nranks_;
paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false);
// all_reduce
std::vector<Tensor> reduce_tensors = {group->dense_contents_};
tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts));
if (tasks_.size() == groups_.size()) {
for (size_t index = 0; index < tasks_.size(); index++) {
auto &task = tasks_.back();
task->Synchronize();
tasks_.pop_back();
}
for (size_t index = 0; index < groups_.size(); index++) {
auto &group = groups_[index];
group.SplitTensors(inner_place_);
}
}
}
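A minimal NumPy sketch of the concat > div_nranks > allreduce > split timeline that FusedAllReduceSchedule follows; the function and variable names here are illustrative only, not Paddle API, and the "ranks" are simulated in-process.

import numpy as np

def fused_allreduce(per_rank_grads):
    # per_rank_grads: one list of gradient arrays per (simulated) rank
    nranks = len(per_rank_grads)
    flat = [np.concatenate([g.ravel() for g in grads])
            for grads in per_rank_grads]               # concat
    flat = [f / nranks for f in flat]                  # div nranks (scale_)
    reduced = np.sum(flat, axis=0)                     # allreduce with ReduceOp::SUM
    sizes = [g.size for g in per_rank_grads[0]]
    shapes = [g.shape for g in per_rank_grads[0]]
    splits = np.split(reduced, np.cumsum(sizes)[:-1])  # split back per tensor
    return [s.reshape(shape) for s, shape in zip(splits, shapes)]

# two ranks with two gradients each -> the element-wise average across ranks
rank0 = [np.ones((2, 2)), np.full((3,), 2.0)]
rank1 = [np.zeros((2, 2)), np.full((3,), 4.0)]
print(fused_allreduce([rank0, rank1]))  # [0.5 * ones((2, 2)), [3., 3., 3.]]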
std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
const auto &tensors_ = group.tensor_indices_;
out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size()
<< "\n";
auto begin = tensors_.begin();
auto end = tensors_.end();
out << "[";
for (int i = 0; begin != end && i < 100; ++i, ++begin) {
if (i > 0) out << ' ';
out << *begin;
}
if (begin != end) {
out << " ...";
}
out << "]\n";
return out;
}
} // namespace distributed
} // namespace paddle
@@ -17,16 +17,109 @@
#include <map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/ext_compat_utils.h"
#include "paddle/phi/common/data_type.h"
namespace paddle {
namespace distributed {
using Tensor = paddle::experimental::Tensor;
using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
using ScalarArray =
paddle::experimental::ScalarArrayBase<paddle::experimental::Tensor>;
std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
-const std::vector<Tensor>, const std::vector<bool>& is_sparse_gradient,
-const std::vector<size_t>& group_size_limits,
-const std::vector<int64_t>& tensor_indices = {});
const std::vector<Tensor>, const std::vector<bool> &is_sparse_gradient,
const std::vector<size_t> &group_size_limits,
const std::vector<int64_t> &tensor_indices = {});
class EagerGroup {
public:
Tensor dense_contents_;
// for concat kernel
std::vector<phi::DenseTensor> dense_tensors_;
std::vector<int64_t> length_;
int64_t all_length_{0};
std::vector<ScalarArray> origin_shapes_;
// Global indices of participating tensors in the group
std::vector<size_t> tensor_indices_;
// Number of params that haven't been ready. When it is 0, it means
// the group is ready.
size_t pending_ = -1;
// external message of group
phi::DataType dtype_;
// context is used to select the stream for concat
void ConcatTensors(const platform::Place &);
// context is used to select the stream for split
void SplitTensors(const platform::Place &);
friend std::ostream &operator<<(std::ostream &, const EagerGroup &);
};
struct TensorLocator {
// record the index in groups_
size_t group_index;
size_t inside_group_index;
};
class EagerReducer {
public:
explicit EagerReducer(
const std::vector<Tensor> tensors,
const std::vector<std::vector<size_t>> &group_indices,
const std::vector<bool> &is_sparse_gradient,
std::shared_ptr<distributed::ProcessGroup> process_group,
const std::vector<size_t> &group_size_limits,
bool find_unused_parameters);
virtual ~EagerReducer() {}
std::shared_ptr<egr::GradNodeBase> GetGradNodeFromTensor(Tensor *tensor);
void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
void InitializeDenseGroups(const std::vector<size_t> &tensor_indices_,
EagerGroup *p_group);
void PrepareForBackward(const std::vector<Tensor> &outputs);
void AddDistHook(size_t var_index);
void MarkVarReady(const size_t var_index, const bool is_used_var);
void MarkGroupReady(const size_t group_index);
void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
private:
std::vector<Tensor> tensors_;
std::vector<std::vector<size_t>> group_indices_;
std::vector<bool> is_sparse_gradient_;
std::shared_ptr<distributed::ProcessGroup> process_group_;
std::vector<size_t> group_size_limits_;
bool find_unused_vars_each_step_;
std::vector<EagerGroup> groups_;
std::vector<TensorLocator> variable_locators_;
PlaceType place_;
platform::Place inner_place_;
size_t next_group_ = 0;
int64_t nranks_ = -1;
std::vector<std::shared_ptr<paddle::distributed::ProcessGroup::Task>> tasks_;
bool grad_need_hooks_{false};
std::vector<bool> vars_marked_ready_;
std::vector<int> local_used_vars_;
};
} // namespace distributed
} // namespace paddle
@@ -23,7 +23,6 @@
#include "gflags/gflags.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
namespace paddle {
@@ -118,9 +117,13 @@ class TruncatedGaussianInitializer : public Initializer {
seed_ = static_cast<unsigned int>(std::stoi(attrs[1]));
mean_ = std::stof(attrs[2]);
std_ = std::stof(attrs[3]);
-std::uniform_real_distribution<float> dist_(
-std::numeric_limits<float>::min(), 1.0);
auto normal_cdf = [](float x) {
return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
};
float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_);
float b_normal_cdf = normal_cdf((2.0 - mean_) / std_);
std::uniform_real_distribution<float> dist_(2.0 * a_normal_cdf - 1.0,
2.0 * b_normal_cdf - 1.0);
random_engine_ = framework::GetCPURandomEngine(seed_);
}
......
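A hedged Python sketch of the sampling math behind the new bounds above: dist_ now draws u uniformly from [2*Phi(a)-1, 2*Phi(b)-1], where Phi is the standard normal CDF and a, b are the truncation points -2 and 2 standardized by mean_ and std_. Assuming u is then mapped back through the inverse normal CDF (as the included truncated_gaussian_random_op.h does for the op itself), every sample lands inside [-2, 2]; the names below are illustrative, not Paddle code.

import math
import random
from statistics import NormalDist

def truncated_normal_sample(mean, std, low=-2.0, high=2.0):
    # Phi, the same expression as the new normal_cdf lambda
    phi = lambda x: (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
    a = 2.0 * phi((low - mean) / std) - 1.0   # lower bound of dist_
    b = 2.0 * phi((high - mean) / std) - 1.0  # upper bound of dist_
    u = random.uniform(a, b)
    # (u + 1) / 2 undoes the 2*Phi - 1 scaling; the affine transform lands in [low, high]
    return mean + std * NormalDist().inv_cdf((u + 1.0) / 2.0)

samples = [truncated_normal_sample(0.5, 1.2) for _ in range(10000)]
assert all(-2.0 <= s <= 2.0 for s in samples)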
-set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node)
@@ -9,6 +10,8 @@ endif()
add_subdirectory(api)
add_subdirectory(accumulation)
add_subdirectory(custom_operator)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
......
@@ -18,7 +18,7 @@
#include <atomic>
#include <memory>
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/ext/op_meta_info.h"
namespace egr {
class UniqueNameGenerator {
@@ -70,6 +70,21 @@ class Controller {
void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; }
const std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>&
GetOpMetaInfoMap() {
return op_meta_info_map_;
}
void MergeOpMetaInfoMap(const std::unordered_map<
std::string, std::vector<paddle::OpMetaInfo>>& map) {
op_meta_info_map_.insert(map.begin(), map.end());
}
std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>&
GetCustomEdgesSlotMap() {
return custom_edges_slot_map_;
}
private:
Controller() = default;
static Controller* controller_;
@@ -77,6 +92,11 @@ class Controller {
new paddle::imperative::Tracer()};
// TODO(jiabin): remove when we don't need imperative.
bool in_eager_mode_{false};
std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>
op_meta_info_map_;
/* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/
std::unordered_map<std::string, std::vector<std::unordered_map<int, int>>>
custom_edges_slot_map_;
DISABLE_COPY_AND_ASSIGN(Controller);
};
......
@@ -1553,9 +1553,23 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
core_ops_returns_info[op_type] = return_contents;
// [Generation] ComputeRequireGrad -> GradNodeCreation
if (!bwd_info.GenerateForwardOnly()) {
std::string grad_node_creation_body_str =
GenerateGradNodeCreationContent(fwd_info, bwd_info);
// Add event record
std::string event_name = op_type + " node_creation";
const char* NODE_CREATION_TEMPLATE =
"{\n"
" paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
"paddle::platform::TracerEventType::Operator, 1);\n"
" %s\n"
"}";
grad_node_creation_body_str = paddle::string::Sprintf(
NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str);
generated_function_body += grad_node_creation_body_str;
generated_function_body += "\n";
@@ -1614,10 +1628,20 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
if ((*iter) == ',') dygraph_function_args_str.erase(iter);
}
-const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n";
const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
"paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
"paddle::platform::TracerEventType::Operator, 1);";
std::string event_name = op_type + " dygraph";
std::string fwd_record_event_str = paddle::string::Sprintf(
DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
const char* FWD_FUNCTION_TEMPLATE =
"%s %s(%s) {\n\n"
" %s\n"
" %s\n"
"}\n\n";
std::string fwd_function_str = paddle::string::Sprintf(
FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-dygraph_function_args_str, generated_function_body);
dygraph_function_args_str, fwd_record_event_str, generated_function_body);
// [Generation] Generate forward functions header
const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
@@ -2240,8 +2264,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
"\"paddle/fluid/eager/api/generated/fluid_generated/"
"dygraph_forward_api.h\"\n"
"#include "
-"\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
-"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
"\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
"#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
std::string forward_cc_include_str =
paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
......
@@ -148,6 +148,12 @@ def ReadBwdFile(filepath):
######################
### Yaml Parsers ###
######################
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
return ret
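A quick illustrative check for the helper above; the "@GRAD" suffix is the kind of decoration it strips before the yaml names are parsed further:

assert RemoveSpecialSymbolsInName("x@GRAD") == "x"
assert RemoveSpecialSymbolsInName("out") == "out"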
def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
# intermediate_outputs : [name0, name1, ...]
# forward_returns_list : [[ret_name, type, orig_pos], ...]
@@ -166,15 +172,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
def ParseDispensable(string):
# string: "X, Y"
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
def ParseIntermediate(string):
string = RemoveSpecialSymbolsInName(string)
return [v.strip() for v in string.split(",")]
def ParseNoNeedBuffer(string):
# string: "x, y"
string = RemoveSpecialSymbolsInName(string)
no_need_buffer_set = set()
for name in string.split(","):
no_need_buffer_set.add(name.strip())
@@ -204,6 +214,8 @@ def ParseYamlArgs(string):
assert arg_type in yaml_types_mapping.keys()
arg_type = yaml_types_mapping[arg_type]
arg_name = RemoveSpecialSymbolsInName(arg_name)
if "Tensor" in arg_type: if "Tensor" in arg_type:
assert default_value is None assert default_value is None
inputs_list.append([arg_name, arg_type, i]) inputs_list.append([arg_name, arg_type, i])
...@@ -239,6 +251,7 @@ def ParseYamlReturns(string): ...@@ -239,6 +251,7 @@ def ParseYamlReturns(string):
ret_type = yaml_types_mapping[ret_type] ret_type = yaml_types_mapping[ret_type]
assert "Tensor" in ret_type assert "Tensor" in ret_type
ret_name = RemoveSpecialSymbolsInName(ret_name)
returns_list.append([ret_name, ret_type, i])
return returns_list
@@ -910,8 +923,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """{{\n
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
{}\n
}}"""
node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name,
node_creation_str)
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
{}
// Forward API Call
{}
@@ -925,7 +950,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_function_name = GetForwardFunctionName(fwd_api_name)
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
-forward_call_str, node_creation_str, returns_str)
dygraph_event_str, forward_call_str, node_creation_str, returns_str)
forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
return forward_function_str, forward_function_declaration_str
@@ -1052,6 +1077,8 @@ def GenerateForwardCCFile(filepath, forward_definition_str):
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
""" """
file_contents += GenerateCoreOpInfoDefinition() file_contents += GenerateCoreOpInfoDefinition()
......
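For reference, a hedged rendering of the NODE_CREATION_TEMPLATE added in the hunk above, filled in with a hypothetical op name and a placeholder body (only the template string itself is taken from the diff):

NODE_CREATION_TEMPLATE = """{{\n
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
{}\n
}}"""
print(NODE_CREATION_TEMPLATE.format("matmul_v2 node_creation",
                                    "// ... grad node creation body ..."))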
@@ -94,9 +94,13 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
dygraph_function_call_list[pos] = f"{name}"
dygraph_function_call_str = ",".join(dygraph_function_call_list)
pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);"
PYTHON_C_FUNCTION_TEMPLATE = """
static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
{{
{}
PyThreadState *tstate = nullptr;
try
{{
@@ -136,8 +140,8 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name)
python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
-fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str,
-fwd_function_name, dygraph_function_call_str)
fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str,
parse_attributes_str, fwd_function_name, dygraph_function_call_str)
python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
@@ -231,6 +235,7 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include <Python.h>
namespace paddle {{
......
@@ -19,6 +19,8 @@
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
...@@ -77,6 +79,9 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap( ...@@ -77,6 +79,9 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors, void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors, const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph) { bool retain_graph) {
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1);
VLOG(6) << "Start Backward"; VLOG(6) << "Start Backward";
// *Gradient Hook should happen at node-level // *Gradient Hook should happen at node-level
// *Inplace version check should perform at node-level // *Inplace version check should perform at node-level
...@@ -112,7 +117,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors, ...@@ -112,7 +117,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
// Prepare GradTensorHolder // Prepare GradTensorHolder
if (!node_input_buffers_dict.count(grad_node)) { if (!node_input_buffers_dict.count(grad_node)) {
VLOG(6) << "Create Value for grad input tensor " << i; VLOG(6) << "Create Value for grad input tensor " << i
<< " of grad node: " << grad_node->name();
node_input_buffers_dict[grad_node] = node_input_buffers_dict[grad_node] =
std::make_unique<GradTensorHolder>(grad_node->InputMeta()); std::make_unique<GradTensorHolder>(grad_node->InputMeta());
} }
...@@ -158,19 +164,27 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors, ...@@ -158,19 +164,27 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
VLOG(6) << "Run Backward"; VLOG(6) << "Run Backward";
while (!queue.empty()) { while (!queue.empty()) {
GradNodeBase* node = queue.front(); GradNodeBase* node = queue.front();
queue.pop();
paddle::platform::RecordEvent node_record_event(
std::string(typeid(*node).name()) + " grad_node",
paddle::platform::TracerEventType::Operator, 1);
if (queue.size() > 1 && node_in_degree_map[node] != 0) {
queue.pop();
continue;
}
queue.pop();
// Run node: This is where Hook happens
PADDLE_ENFORCE(
node_input_buffers_dict.count(node),
paddle::platform::errors::Fatal(
-"Unable to find next node in the InputBuufer"
"Unable to find next node in the GradTensorHolder \n"
"Trying to run Node without configuring its GradTensorHolder"));
std::unique_ptr<GradTensorHolder> node_input_buffer =
std::move(node_input_buffers_dict[node]);
-VLOG(6) << "Run Backward Kernel with input_buffer";
VLOG(6) << "Run Backward Kernel with GradTensorHolder";
// Run Pre Backward Node and get outputs
std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers());
@@ -215,9 +229,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor,
if ((!grad_output_tensor.defined() ||
!grad_output_tensor.initialized())) {
-VLOG(6)
-<< "We get grad_output_tensor with slot: " << i << ", rank: " << j
-<< " as uninitialized or undefined in both tensor and variable";
VLOG(6) << "We get grad_output_tensor with slot: " << i
<< ", rank: " << j << " as uninitialized or undefined tensor";
}
VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
<< ", rank: " << j
@@ -228,6 +241,8 @@ void RunBackward(const std::vector<paddle::experimental::Tensor,
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
@@ -237,10 +252,12 @@ void RunBackward(const std::vector<paddle::experimental::Tensor,
// Update queue
node_in_degree_map[next_node]--;
-PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0,
-paddle::platform::errors::Fatal(
-"Detected in-degree value smaller than zero."
-"Node's in-degree cannot be negative"));
PADDLE_ENFORCE(
node_in_degree_map[next_node] >= 0,
paddle::platform::errors::Fatal(
"Detected in-degree value smaller than zero. For Node: %s"
"Node's in-degree cannot be negative",
next_node->name()));
if (node_in_degree_map[next_node] == 0) {
queue.emplace(std::move(next_node));
}
......
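A generic Python sketch of the queue plus in-degree traversal that RunBackward performs over grad nodes (names are illustrative; the real loop also moves GradTensorHolder buffers, runs hooks, and records the profiler events added above):

from collections import deque

def run_backward(start_nodes, in_degree, next_nodes):
    # in_degree: node -> number of grad inputs not yet produced
    # next_nodes: node -> downstream grad nodes reached through its edges
    queue = deque(start_nodes)
    order = []
    while queue:
        node = queue.popleft()
        order.append(node)  # here the real code runs (*node)(buffers)
        for nxt in next_nodes.get(node, []):
            in_degree[nxt] -= 1
            assert in_degree[nxt] >= 0, f"negative in-degree for {nxt}"
            if in_degree[nxt] == 0:
                queue.append(nxt)  # ready once all of its inputs arrived
    return order

# tiny example graph: a -> c and b -> c, so c runs last
print(run_backward(["a", "b"], {"c": 2}, {"a": ["c"], "b": ["c"]}))  # ['a', 'b', 'c']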
cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/op_meta_info_helper.h"
#include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/core/dense_tensor.h"
namespace egr {
std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
paddle::CustomOpKernelContext ctx;
auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
std::vector<std::vector<paddle::experimental::Tensor>> tmp_ins(
grad_inputs_name.size());
VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size()
<< ", whose grad_inputs_name size is: " << grad_inputs_name.size();
for (size_t i = 0; i < grads.size(); i++) {
if (map[1].find(i) != map[1].end()) {
VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i];
tmp_ins[map[1][i]] = grads[i];
}
}
for (auto it : fwd_outs) {
VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second));
}
for (auto it : fwd_ins) {
VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second));
}
VLOG(6) << "Prepare Grad inputs";
for (const auto& in : tmp_ins) {
ctx.EmplaceBackInputs(in);
}
VLOG(6) << "Prepare Grad attrs";
ctx.EmplaceBackAttrs(attrs_);
std::vector<std::vector<paddle::experimental::Tensor>> outs(
GetEdges().size());
std::vector<std::vector<paddle::experimental::Tensor>> tmp_outs(
grad_outputs_names.size());
VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size();
for (size_t i = 0; i < GetEdges().size(); i++) {
if (map[0].find(i) != map[0].end()) {
VLOG(7) << "Insert grad outputs: " << i
<< " with size: " << GetEdges()[i].size()
<< " to tmp_outputs: " << map[0][i];
for (size_t j = 0; j < GetEdges()[i].size(); j++) {
outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */
std::make_shared<phi::DenseTensor>(
phi::DataType::UNDEFINED),
egr::Controller::Instance().GenerateUniqueName(
"custom_tmp_grad"));
}
tmp_outs[map[0][i]] = outs[i];
}
}
for (size_t i = 0; i < tmp_outs.size(); i++) {
VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size();
ctx.EmplaceBackOutputs(tmp_outs[i]);
}
VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_;
(*paddle::framework::OpMetaInfoHelper::GetKernelFn(
kernel_map.at(op_type_)[1]))(&ctx);
return outs;
}
} // namespace egr
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/framework/custom_operator.h"
#include "paddle/utils/any.h"
namespace egr {
class RunCustomOpNode : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num,
const std::string& op_type)
: GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) {
VLOG(6) << "Construct RunCustomOpNode for op: " << op_type;
}
~RunCustomOpNode() override {
VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_;
}
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
}
static std::vector<egr::TensorWrapper> ConstructTensorWrapper(
const std::vector<paddle::experimental::Tensor>& fwd_var) {
std::vector<egr::TensorWrapper> res;
for (auto const& var : fwd_var) {
res.emplace_back(var);
}
return res;
}
static std::vector<paddle::experimental::Tensor> Recover(
std::vector<egr::TensorWrapper>* fwd_var) {
std::vector<paddle::experimental::Tensor> res;
for (size_t i = 0; i < fwd_var->size(); i++) {
res.emplace_back(fwd_var->at(i).recover(nullptr));
}
return res;
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
public:
std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_outs;
std::unordered_map<int, std::vector<egr::TensorWrapper>> fwd_ins;
std::unordered_map<int, int> grads2grad_in_map;
private:
std::vector<paddle::any> attrs_;
std::string op_type_{""};
};
} // namespace egr
@@ -25,7 +25,7 @@
#include "glog/logging.h"
/**
-* Implementation of GradNodeBase, Edge and InputBuffer.
* Implementation of GradNodeBase, Edge and GradTensorHolder.
**/
namespace egr {
......
@@ -40,6 +40,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
......
@@ -44,6 +44,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
TEST(Benchmark, EagerScaleCUDA) {
eager_test::InitEnv(paddle::platform::CUDAPlace());
......
@@ -41,6 +41,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
namespace paddle {
namespace imperative {
......
@@ -43,6 +43,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
namespace paddle {
namespace imperative {
......
@@ -57,6 +57,7 @@ inline void run_program_dygraph_function(
auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
grad_node->SetFwdOutNames(out_names);
grad_node->SetOut(out);
// Set Attributes
grad_node->SetAttrMap(attrs);
// Set TensorWrappers
......
@@ -260,9 +260,9 @@ inline void RunProgramAPI(
}
VLOG(2) << "The number of sub scopes after forward: "
<< out_scope_vec->front()->kids().size();
-// #ifdef PADDLE_WITH_MKLDNN
-// if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
-// #endif
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
#endif
}
inline void RunProgramGradAPI(
@@ -357,7 +357,7 @@ inline void RunProgramGradAPI(
details::ShareTensorsFromScope(params_grad, *global_block, &scope);
// Step5. drop current scope
-// global_inner_scope->DeleteScope(&scope);
global_inner_scope->DeleteScope(&scope);
VLOG(2) << "The number of sub scopes after backward: "
<< global_inner_scope->kids().size();
}
@@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase {
paddle::platform::errors::InvalidArgument(
"The grads[0].size() and fwd_out_names_.size() should be equal."));
for (size_t i = 0; i < fwd_out_names_.size(); ++i) {
auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad();
const_cast<paddle::experimental::Tensor &>(out_grad).set_impl(
grads[0][i].impl());
const_cast<paddle::experimental::Tensor &>(grads[0][i])
.set_name(fwd_out_names_[i] + "@GRAD");
}
@@ -432,6 +436,10 @@ class GradNodeRunProgram : public egr::GradNodeBase {
fwd_out_names_ = out_names;
}
void SetOut(const std::vector<paddle::experimental::Tensor *> &out) {
out_ = out;
}
protected:
void ConstructGradTensors(
const std::vector<paddle::experimental::Tensor> &fwd_tensors,
@@ -440,7 +448,11 @@ class GradNodeRunProgram : public egr::GradNodeBase {
// such as: name, tensor type(DenseTensor or SelectedRows).
VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
for (auto &fwd_t : fwd_tensors) {
-grad_tensors->emplace_back(fwd_t.impl());
if (phi::DenseTensor::classof(fwd_t.impl().get())) {
grad_tensors->emplace_back(std::make_shared<phi::DenseTensor>());
} else if (phi::SelectedRows::classof(fwd_t.impl().get())) {
grad_tensors->emplace_back(std::make_shared<phi::SelectedRows>());
}
auto &grad_t = grad_tensors->back();
grad_t.set_name(fwd_t.name() + "@GRAD");
}
@@ -462,6 +474,7 @@ class GradNodeRunProgram : public egr::GradNodeBase {
std::vector<paddle::framework::Scope *> step_scope_;
std::vector<std::string> fwd_out_names_;
std::vector<paddle::experimental::Tensor *> out_;
// Attribute Map
paddle::framework::AttributeMap attrs_;
......
@@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api)
#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_meta_info_helper.h"
...@@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ...@@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap(
////////////////////// User APIs /////////////////////// ////////////////////// User APIs ///////////////////////
// load op api // load op api
void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { const std::unordered_map<std::string, std::vector<OpMetaInfo>>&
LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name);
VLOG(3) << "load custom_op lib: " << dso_name; VLOG(3) << "load custom_op lib: " << dso_name;
typedef OpMetaInfoMap& get_op_meta_info_map_t(); typedef OpMetaInfoMap& get_op_meta_info_map_t();
auto* get_op_meta_info_map = auto* get_op_meta_info_map =
detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap"); detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap");
auto& op_meta_info_map = get_op_meta_info_map(); auto& op_meta_info_map = get_op_meta_info_map();
RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle);
return op_meta_info_map.GetMap();
} }
} // namespace framework } // namespace framework
......
...@@ -20,9 +20,9 @@ limitations under the License. */ ...@@ -20,9 +20,9 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Load custom op api: register op after user compiled // Load custom op api: register op after user compiled
void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); const std::unordered_map<std::string, std::vector<OpMetaInfo>>&
LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);
// Register custom op api: register op directly // Register custom op api: register op directly
void RegisterOperatorWithMetaInfoMap( void RegisterOperatorWithMetaInfoMap(
...@@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( ...@@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap(
// Interface for selective register custom op. // Interface for selective register custom op.
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos, void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
void* dso_handle = nullptr); void* dso_handle = nullptr);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
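Note: with this change LoadOpMetaInfoAndRegisterOp also returns the loaded meta info map instead of void. A minimal usage sketch; the header path and the DSO name "custom_relu.so" are assumptions for illustration, not taken from this diff:
#include <iostream>
#include "paddle/fluid/framework/custom_operator.h"  // assumed location of the declaration
void ListLoadedCustomOps() {
  // Load the custom-op shared library and inspect what was registered.
  const auto& op_map =
      paddle::framework::LoadOpMetaInfoAndRegisterOp("custom_relu.so");
  for (const auto& pair : op_map) {
    // pair.first is the op name, pair.second holds its forward/backward meta infos.
    std::cout << pair.first << " provides " << pair.second.size()
              << " OpMetaInfo entries\n";
  }
}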
...@@ -249,13 +249,13 @@ class CompatMetaTensor : public phi::MetaTensor { ...@@ -249,13 +249,13 @@ class CompatMetaTensor : public phi::MetaTensor {
} }
void share_meta(const MetaTensor& meta_tensor) override { void share_meta(const MetaTensor& meta_tensor) override {
share_dims(meta_tensor);
set_dtype(meta_tensor.dtype()); set_dtype(meta_tensor.dtype());
// VarDesc doesn't contain layout, so we cannot share layout // VarDesc doesn't contain layout, so we cannot share layout
// set_layout(meta_tensor.layout()); // set_layout(meta_tensor.layout());
// special case 1: share lod of LoDTensor // special case: share lod of LoDTensor
share_lod(meta_tensor); share_lod(meta_tensor);
share_dims(meta_tensor);
} }
private: private:
...@@ -297,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, ...@@ -297,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature;
// 2. build infermeta context // 2. build infermeta context
phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); phi::InferMetaContext infer_meta_context(
{ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()});
auto& input_names = std::get<0>(signature.args); auto& input_names = std::get<0>(signature.args);
auto& attr_names = std::get<1>(signature.args); auto& attr_names = std::get<1>(signature.args);
...@@ -499,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, ...@@ -499,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
"Unsupported attribute type is received when call " "Unsupported attribute type is received when call "
"InferShapeFunctor.")); "InferShapeFunctor."));
} }
} else { } else if (ctx->HasInput(attr_name)) {
// do nothing // convert from data
if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) {
if (ctx->IsRuntime()) {
const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name);
auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]);
auto val = experimental::MakePhiScalarFromVar(*var_temp);
int32_t val_int = val.template to<int32_t>();
infer_meta_context.EmplaceBackAttr(val_int);
} else {
infer_meta_context.EmplaceBackAttr(-1);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Get value from variable only support int yet"));
}
} }
} }
......
...@@ -126,6 +126,7 @@ if(WITH_MKLDNN) ...@@ -126,6 +126,7 @@ if(WITH_MKLDNN)
pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(interpolate_mkldnn_pass inference DIR mkldnn)
pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn)
pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn)
pass_library(cpu_quantize_squash_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
......
...@@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( ...@@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()(
return activation_out_var; return activation_out_var;
} }
PDNode *patterns::ElementwiseActivation::operator()(
paddle::framework::ir::PDNode *elementwise_a,
const std::string &elementwise_type, const std::string &activation_type) {
// Create Operators
elementwise_a->assert_is_op_input(elementwise_type, "X");
auto *elementwise_op =
pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type);
auto *activation_op =
pattern->NewNode(activation_repr())->assert_is_op(activation_type);
// Create variables
auto *elementwise_b = pattern->NewNode(elementwise_b_repr())
->AsInput()
->assert_is_op_input(elementwise_type, "Y");
// intermediate variable, will be removed from the IR after the fuse.
auto *elementwise_out_var =
pattern->NewNode(elementwise_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op(elementwise_type)
->assert_is_op_input(activation_type);
// output
auto *activation_out_var = pattern->NewNode(activation_out_repr())
->AsOutput()
->assert_is_op_output(activation_type);
elementwise_op->LinksFrom({elementwise_a, elementwise_b})
.LinksTo({elementwise_out_var});
activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var});
return activation_out_var;
}
PDNode *patterns::SeqConvEltAddRelu::operator()( PDNode *patterns::SeqConvEltAddRelu::operator()(
paddle::framework::ir::PDNode *seqconv_input) { paddle::framework::ir::PDNode *seqconv_input) {
// Create Operators // Create Operators
......
...@@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { ...@@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase {
PATTERN_DECL_NODE(activation_out); PATTERN_DECL_NODE(activation_out);
}; };
// Elementwise with Activation
// op: elementwise + activation
// named nodes:
// elementwise_a, elementwise_b,
// elementwise_out, elementwise,
// activation_out, activation
struct ElementwiseActivation : public PatternBase {
ElementwiseActivation(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add_activation") {}
PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type,
const std::string& activation_type);
// declare operator node's name
PATTERN_DECL_NODE(elementwise);
PATTERN_DECL_NODE(activation);
// declare variable node's name
PATTERN_DECL_NODE(elementwise_b);
PATTERN_DECL_NODE(elementwise_out);
PATTERN_DECL_NODE(activation_out);
};
// SEQCONV with Elementwise_Add ReLU // SEQCONV with Elementwise_Add ReLU
// op: seqconv + elementwise_add + relu // op: seqconv + elementwise_add + relu
// named nodes: // named nodes:
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const {
std::vector<std::string> act_types = {
"relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt",
"abs", "clip", "gelu", "relu6", "sigmoid"};
std::vector<std::string> elt_types = {"elementwise_add", "elementwise_sub",
"elementwise_mul"};
for (const auto &elt_type : elt_types)
for (const auto &act_type : act_types) {
std::unordered_map<std::string, std::string> attr_map;
if (act_type == "swish")
attr_map.emplace("beta", "activation_alpha");
else if (act_type == "relu6")
attr_map.emplace("threshold", "activation_alpha");
else if (act_type == "clip") {
attr_map.emplace("min", "activation_alpha");
attr_map.emplace("max", "activation_beta");
} else {
attr_map.emplace("alpha", "activation_alpha");
attr_map.emplace("beta", "activation_beta");
}
FuseElementwiseAct(graph, elt_type, act_type, attr_map);
}
}
void ElementwiseActivationOneDNNPass::FuseElementwiseAct(
Graph *graph, const std::string &elt_type, const std::string &act_type,
const std::unordered_map<std::string, std::string> &attr_map) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init("elementwise_act", graph);
GraphPatternDetector gpd;
auto *elementwise_input = gpd.mutable_pattern()
->NewNode(elt_type + "_act/elementwise_input")
->AsInput()
->assert_is_op_input(elt_type, "X");
patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(),
elt_type + "_act");
elementwise_act_pattern(elementwise_input, elt_type, act_type);
int found_elementwise_activation_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "Fuse " << elt_type << " with activation op.";
// Elementwise output
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
elementwise_act_pattern);
// ACT output
GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out,
elementwise_act_pattern);
// ops
GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise,
elementwise_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern);
auto *elementwise_op = elementwise->Op();
if (elementwise_op->HasAttr("use_mkldnn")) {
const std::string wo_elt_type =
"The " + elt_type; // Workaround for PP error message checking.
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true,
platform::errors::PreconditionNotMet(
wo_elt_type + "+Act fusion may happen only when oneDNN library "
"is used."));
}
auto *activation_op = activation->Op();
for (const auto &attr : attr_map) {
if (activation_op->HasAttr(attr.first)) {
elementwise_op->SetAttr(attr.second,
activation_op->GetAttr(attr.first));
}
}
if (act_type == "gelu" && activation_op->HasAttr("approximate") &&
BOOST_GET_CONST(bool, activation_op->GetAttr("approximate")))
elementwise_op->SetAttr("activation_type", std::string("gelu_tanh"));
else
elementwise_op->SetAttr("activation_type", act_type);
elementwise_op->SetOutput("Out", {activation_out->Name()});
IR_OP_VAR_LINK(elementwise, activation_out);
GraphSafeRemoveNodes(g, {activation, elementwise_out});
found_elementwise_activation_count++;
};
gpd(graph, handler);
AddStatis(found_elementwise_activation_count);
PrettyLogDetail("--- fused %d %s with %s activation",
found_elementwise_activation_count, elt_type, act_type);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(elt_act_mkldnn_fuse_pass,
paddle::framework::ir::ElementwiseActivationOneDNNPass);
REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("elementwise_add", 1)
.LE("elementwise_sub", 1)
.LE("elementwise_mul", 1)
.LE("relu", 0)
.LE("tanh", 0)
.LE("leaky_relu", 1)
.LE("swish", 0)
.LE("hard_swish", 0)
.LE("sqrt", 0)
.LE("abs", 0)
.LE("clip", 1)
.LE("gelu", 0)
.LE("relu6", 0)
.LE("sigmoid", 0));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* \brief Fuse the Elementwise and activation operators into a single
* oneDNN Elementwise operator with an activation post-op.
*/
class ElementwiseActivationOneDNNPass : public FusePassBase {
public:
virtual ~ElementwiseActivationOneDNNPass() {}
protected:
void ApplyImpl(Graph *graph) const override;
void FuseElementwiseAct(
Graph *graph, const std::string &elt_types, const std::string &act_types,
const std::unordered_map<std::string, std::string> &attr_map) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax); ...@@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(leaky_relu); USE_OP_ITSELF(leaky_relu);
USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
USE_OP(gelu); USE_OP(gelu);
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP(tanh); USE_OP_ITSELF(tanh);
USE_OP_DEVICE_KERNEL(tanh, MKLDNN); USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
namespace paddle { namespace paddle {
......
...@@ -32,7 +32,7 @@ USE_OP(concat); ...@@ -32,7 +32,7 @@ USE_OP(concat);
USE_OP(matmul); USE_OP(matmul);
USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add);
USE_OP(sigmoid); USE_OP(sigmoid);
USE_OP(tanh); USE_OP_ITSELF(tanh);
USE_OP(elementwise_mul); USE_OP(elementwise_mul);
USE_OP(softmax_with_cross_entropy); USE_OP(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_mean);
...@@ -48,7 +48,7 @@ USE_OP(transpose2_grad); ...@@ -48,7 +48,7 @@ USE_OP(transpose2_grad);
USE_OP(concat_grad); USE_OP(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad); USE_OP_ITSELF(elementwise_mul_grad);
USE_OP(sigmoid_grad); USE_OP(sigmoid_grad);
USE_OP(tanh_grad); USE_OP_ITSELF(tanh_grad);
USE_OP(sum); USE_OP(sum);
USE_OP(slice_grad); USE_OP(slice_grad);
USE_OP(lookup_table_grad); USE_OP(lookup_table_grad);
......
...@@ -2250,41 +2250,62 @@ void OperatorWithKernel::BuildPhiKernelContext( ...@@ -2250,41 +2250,62 @@ void OperatorWithKernel::BuildPhiKernelContext(
} }
} else { } else {
// TODO(chenweihang): support other attrs later // TODO(chenweihang): support other attrs later
auto& attr = Attrs().at(attr_names[i]); auto attr_it = attrs_.find(attr_names[i]);
if (attr_defs[i].type_index == std::type_index(typeid(int))) { if (attr_defs[i].type_index == std::type_index(typeid(int))) {
pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); if (attr_it == attrs_.end()) {
auto in_it = ctx.inputs.find(attr_names[i]);
if (in_it != ctx.inputs.end()) {
// get data from input
auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0]));
int32_t val_int = val.template to<int32_t>();
pt_kernel_context->EmplaceBackAttr(val_int);
} else {
PADDLE_THROW(platform::errors::NotFound(
"can not find attribute `%s` both in attribute and input ",
attr_names[i]));
}
} else {
pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(int, attr_it->second));
}
} else if (attr_defs[i].type_index == std::type_index(typeid(float))) { } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(float, attr_it->second));
} else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(bool, attr_it->second));
} else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) {
pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(int64_t, attr_it->second));
} else if (attr_defs[i].type_index == } else if (attr_defs[i].type_index ==
std::type_index(typeid(std::string))) { std::type_index(typeid(std::string))) {
pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(std::string, attr_it->second));
} else if (attr_defs[i].type_index == } else if (attr_defs[i].type_index ==
std::type_index(typeid(phi::DataType))) { std::type_index(typeid(phi::DataType))) {
auto data_type = paddle::framework::TransToPhiDataType( auto data_type = paddle::framework::TransToPhiDataType(
static_cast<framework::proto::VarType::Type>( static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, attr))); BOOST_GET_CONST(int, attr_it->second)));
pt_kernel_context->EmplaceBackAttr(data_type); pt_kernel_context->EmplaceBackAttr(data_type);
} else if (attr_defs[i].type_index == } else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int64_t>))) { std::type_index(typeid(std::vector<int64_t>))) {
if (std::type_index(attr.type()) == if (std::type_index(attr_it->second.type()) ==
std::type_index(typeid(std::vector<int64_t>))) { std::type_index(typeid(std::vector<int64_t>))) {
pt_kernel_context->EmplaceBackAttr( pt_kernel_context->EmplaceBackAttr(
BOOST_GET_CONST(std::vector<int64_t>, attr)); BOOST_GET_CONST(std::vector<int64_t>, attr_it->second));
} else if (std::type_index(attr.type()) == } else if (std::type_index(attr_it->second.type()) ==
std::type_index(typeid(std::vector<int>))) { std::type_index(typeid(std::vector<int>))) {
// Emplace Back Attr according to the type of Phi_Kernel args. // Emplace Back Attr according to the type of Phi_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr); const auto& vector_int_attr =
BOOST_GET_CONST(std::vector<int>, attr_it->second);
const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(), const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
vector_int_attr.end()); vector_int_attr.end());
pt_kernel_context->EmplaceBackAttr(vector_int64_attr); pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
} }
} else if (attr_defs[i].type_index == } else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int32_t>))) { std::type_index(typeid(std::vector<int32_t>))) {
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr); const auto& vector_int_attr =
BOOST_GET_CONST(std::vector<int>, attr_it->second);
pt_kernel_context->EmplaceBackAttr(vector_int_attr); pt_kernel_context->EmplaceBackAttr(vector_int_attr);
} else { } else {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
......
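The change above lets an int attribute that is missing from the attribute map be read from a same-named input variable instead. A simplified, framework-independent sketch of that lookup-with-fallback logic (FakeContext and its fields are stand-ins, not Paddle types):
#include <map>
#include <stdexcept>
#include <string>
struct FakeContext {
  std::map<std::string, int> attrs;          // ordinary attributes
  std::map<std::string, int> input_scalars;  // pretend each input holds one scalar
};
int ResolveIntAttr(const FakeContext& ctx, const std::string& name) {
  auto attr_it = ctx.attrs.find(name);
  if (attr_it != ctx.attrs.end()) return attr_it->second;  // normal attribute path
  auto in_it = ctx.input_scalars.find(name);
  if (in_it != ctx.input_scalars.end()) return in_it->second;  // fallback: read from input
  throw std::runtime_error("cannot find attribute `" + name +
                           "` in either attributes or inputs");
}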
...@@ -45,8 +45,8 @@ Program CreateAddProgram() { ...@@ -45,8 +45,8 @@ Program CreateAddProgram() {
NetBuilder builder("net_builder"); NetBuilder builder("net_builder");
auto a = builder.CreateInput(Float(32), {M, N}); auto a = builder.CreateInput(Float(32), {M, N});
auto b = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N});
auto c = builder.add(a, b); auto c = builder.Add(a, b);
auto d = builder.add(a, c); auto d = builder.Add(a, c);
auto program = builder.Build(); auto program = builder.Build();
return program; return program;
...@@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { ...@@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) {
auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight
auto b = builder.CreateInput(Float(32), {N}, "B"); // bias auto b = builder.CreateInput(Float(32), {N}, "B"); // bias
auto mul_out = builder.mul(a, w, 2, 1); auto mul_out = builder.Mul(a, w, 2, 1);
auto add_out = builder.add(mul_out, b); auto add_out = builder.Add(mul_out, b);
auto program = builder.Build(); auto program = builder.Build();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, ...@@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type,
} }
void BasicEngine::Execute() { void BasicEngine::Execute() {
platform::RecordEvent backward_record_event(
"backward", platform::TracerEventType::Operator, 1);
if (init_nodes_.empty()) { if (init_nodes_.empty()) {
return; return;
} }
...@@ -412,7 +415,7 @@ void BasicEngine::Execute() { ...@@ -412,7 +415,7 @@ void BasicEngine::Execute() {
for (auto& cur_op : *shared_cur_node) { for (auto& cur_op : *shared_cur_node) {
platform::RecordEvent op_type_record_event( platform::RecordEvent op_type_record_event(
cur_op.Type(), platform::TracerEventType::Operator, 1); cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1);
++op_num; ++op_num;
......
...@@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( ...@@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
if ((it == ins.end()) && if (it == ins.end()) {
(input_defs[i].type_index == if (LIKELY(input_defs[i].type_index ==
std::type_index(typeid(paddle::optional<const phi::DenseTensor&>)))) { std::type_index(
kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); typeid(paddle::optional<const phi::DenseTensor&>)))) {
auto end_idx = start_idx + 1; kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); auto end_idx = start_idx + 1;
continue; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
continue;
} else {
PADDLE_THROW(phi::errors::NotFound(
"Can not find input variable '%s' for %s OP, please check whether "
"the name setting in OpArgumentMapping is consistent with that in "
"OpMaker.",
input_names[i], pt_kernel_signature.name));
}
} }
auto ins_vector = it->second; auto ins_vector = it->second;
size_t end_idx = start_idx + ins_vector.size(); size_t end_idx = start_idx + ins_vector.size();
...@@ -410,6 +419,17 @@ void BuildDygraphPhiKernelContext( ...@@ -410,6 +419,17 @@ void BuildDygraphPhiKernelContext(
experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); experimental::MakePhiScalarFromVar(ins_vector[0]->Var())));
} }
} else if (ins.find(attr_names[i]) != ins.end()) {
// deal tensor attr here
auto& ins_vector = ins.at(attr_names[i]);
auto tensor_attr =
experimental::MakePhiScalarFromVar(ins_vector[0]->Var());
if (attr_defs[i].type_index == std::type_index(typeid(int))) {
int val = tensor_attr.template to<int>();
kernel_ctx->EmplaceBackAttr(val);
} else {
PADDLE_THROW(platform::errors::Unimplemented("only support int here"));
}
} else if (attr_defs[i].type_index == } else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<phi::Scalar>))) { std::type_index(typeid(std::vector<phi::Scalar>))) {
auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
...@@ -466,6 +486,7 @@ void BuildDygraphPhiKernelContext( ...@@ -466,6 +486,7 @@ void BuildDygraphPhiKernelContext(
} }
} else { } else {
// TODO(chenweihang): support other attrs later // TODO(chenweihang): support other attrs later
auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
if (attr_defs[i].type_index == std::type_index(typeid(int))) { if (attr_defs[i].type_index == std::type_index(typeid(int))) {
kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
......
...@@ -34,6 +34,7 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); ...@@ -34,6 +34,7 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
#endif #endif
namespace imperative = paddle::imperative; namespace imperative = paddle::imperative;
......
...@@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins, ...@@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
paddle::framework::AttributeMap* passed_default_attrs_, paddle::framework::AttributeMap* passed_default_attrs_,
bool use_default_attr_map) { bool use_default_attr_map) {
platform::RecordEvent op_type_record_event( platform::RecordEvent op_type_record_event(
type, platform::TracerEventType::Operator, 1); type + " trace_op", platform::TracerEventType::Operator, 1);
platform::ScopedFlushDenormal flush; platform::ScopedFlushDenormal flush;
VLOG(1) << "Trace Op: " << type; VLOG(1) << "Trace Op: " << type;
if (FLAGS_use_mkldnn) { if (FLAGS_use_mkldnn) {
...@@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins, ...@@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
} }
if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { {
PADDLE_ENFORCE_EQ( platform::RecordEvent node_creation_record_event(
passed_default_attrs_, nullptr, type + " node_creation", platform::TracerEventType::Operator, 1);
paddle::platform::errors::PermissionDenied(
"We expect passed_default_attrs_ is nullptr while " if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
"use_default_attr_map is true, however we got not null " PADDLE_ENFORCE_EQ(
"passed_default_attrs_. Please check your usage of trace_op. ")); passed_default_attrs_, nullptr,
CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, paddle::platform::errors::PermissionDenied(
inplace_map); "We expect passed_default_attrs_ is nullptr while "
} else { "use_default_attr_map is true, however we got not null "
VLOG(3) << "No Grad to track for Op: " << type; "passed_default_attrs_. Please check your usage of trace_op. "));
CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place,
inplace_map);
} else {
VLOG(3) << "No Grad to track for Op: " << type;
}
VLOG(6) << "Finish Trace Op: " << type;
} }
VLOG(6) << "Finish Trace Op: " << type;
} }
template void Tracer::TraceOp<VarBase>( template void Tracer::TraceOp<VarBase>(
......
...@@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() { ...@@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() {
// "fc_act_mkldnn_fuse_pass", // "fc_act_mkldnn_fuse_pass",
"batch_norm_act_fuse_pass", // "batch_norm_act_fuse_pass", //
"softplus_activation_mkldnn_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", //
"elt_act_mkldnn_fuse_pass", //
// TODO(intel): Please fix the bug on windows. // TODO(intel): Please fix the bug on windows.
// https://github.com/PaddlePaddle/Paddle/issues/29710 // https://github.com/PaddlePaddle/Paddle/issues/29710
// "mkldnn_inplace_pass", // This pass should be activated after // "mkldnn_inplace_pass", // This pass should be activated after
......
...@@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { ...@@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(pool2d); USE_OP_ITSELF(pool2d);
REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
...@@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { ...@@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(pool3d); USE_OP_ITSELF(pool3d);
REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter);
...@@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } ...@@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
USE_OP_ITSELF(relu); USE_OP_ITSELF(relu);
USE_OP(sigmoid); USE_OP(sigmoid);
USE_OP(tanh); USE_OP_ITSELF(tanh);
USE_OP(relu6); USE_OP(relu6);
...@@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { ...@@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) {
} // namespace paddle } // namespace paddle
// USE_OP(leaky_relu); // USE_OP(leaky_relu);
USE_OP(leaky_relu); USE_OP_ITSELF(leaky_relu);
...@@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } ...@@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(pool2d); USE_OP_ITSELF(pool2d);
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, ...@@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs,
output_shape.insert(output_shape.begin(), batchSize); output_shape.insert(output_shape.begin(), batchSize);
if (pool3d_type_ == Pool3DType::max) { if (pool3d_type_ == Pool3DType::max) {
paddle::operators::math::MaxPool<float> pool_process; phi::funcs::MaxPool<float> pool_process;
paddle::operators::math::Pool3dDirectCUDAFunctor< phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
paddle::operators::math::MaxPool<float>, float>
pool3d_forward; pool3d_forward;
pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, true, adaptive_, odatas[0], stream, pool_process); paddings_, true, adaptive_, odatas[0], stream, pool_process);
} else if (pool3d_type_ == Pool3DType::avg) { } else if (pool3d_type_ == Pool3DType::avg) {
paddle::operators::math::AvgPool<float> pool_process; phi::funcs::AvgPool<float> pool_process;
paddle::operators::math::Pool3dDirectCUDAFunctor< phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
paddle::operators::math::AvgPool<float>, float>
pool3d_forward; pool3d_forward;
pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, pool3d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, true, adaptive_, odatas[0], stream, pool_process); paddings_, true, adaptive_, odatas[0], stream, pool_process);
...@@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, ...@@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
} }
if (pool3d_type_ == "max") { if (pool3d_type_ == "max") {
paddle::operators::math::MaxPool<float> pool_process; phi::funcs::MaxPool<float> pool_process;
paddle::operators::math::Pool3dDirectCUDAFunctor< phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
paddle::operators::math::MaxPool<float>, float>
pool3d_forward; pool3d_forward;
pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
true, adaptive_, output, stream, pool_process); true, adaptive_, output, stream, pool_process);
} else if (pool3d_type_ == "avg") { } else if (pool3d_type_ == "avg") {
paddle::operators::math::AvgPool<float> pool_process; phi::funcs::AvgPool<float> pool_process;
paddle::operators::math::Pool3dDirectCUDAFunctor< phi::funcs::Pool3dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
paddle::operators::math::AvgPool<float>, float>
pool3d_forward; pool3d_forward;
pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
true, adaptive_, output, stream, pool_process); true, adaptive_, output, stream, pool_process);
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, ...@@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs,
output_shape.insert(output_shape.begin(), batchSize); output_shape.insert(output_shape.begin(), batchSize);
if (pool_type_ == PoolType::max) { if (pool_type_ == PoolType::max) {
paddle::operators::math::MaxPool<float> pool_process; phi::funcs::MaxPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor< phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
paddle::operators::math::MaxPool<float>, float>
pool2d_forward; pool2d_forward;
pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, true, false, odatas[0], stream, pool_process); paddings_, true, false, odatas[0], stream, pool_process);
} else if (pool_type_ == PoolType::avg) { } else if (pool_type_ == PoolType::avg) {
paddle::operators::math::AvgPool<float> pool_process; phi::funcs::AvgPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor< phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
paddle::operators::math::AvgPool<float>, float>
pool2d_forward; pool2d_forward;
pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, pool2d_forward(idata, input_shape, output_shape, ksize_, strides_,
paddings_, exclusive_, adaptive_, odatas[0], stream, paddings_, exclusive_, adaptive_, odatas[0], stream,
...@@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, ...@@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
} }
if (pool_type_ == "max") { if (pool_type_ == "max") {
paddle::operators::math::MaxPool<float> pool_process; phi::funcs::MaxPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor< phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::MaxPool<float>, float>
paddle::operators::math::MaxPool<float>, float>
pool2d_forward; pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
true, false, output, stream, pool_process); true, false, output, stream, pool_process);
} else if (pool_type_ == "avg") { } else if (pool_type_ == "avg") {
paddle::operators::math::AvgPool<float> pool_process; phi::funcs::AvgPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor< phi::funcs::Pool2dDirectCUDAFunctor<phi::funcs::AvgPool<float>, float>
paddle::operators::math::AvgPool<float>, float>
pool2d_forward; pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
exclusive_, adaptive_, output, stream, pool_process); exclusive_, adaptive_, output, stream, pool_process);
......
...@@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut ...@@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut
if(NOT WIN32) if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
if (WITH_GPU)
cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator)
endif()
endif(NOT WIN32) endif(NOT WIN32)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _WIN32
#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <random>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
namespace {
std::mutex ipc_mutex_;
std::unordered_map<std::string, std::weak_ptr<void>> ipc_handle_to_baseptr_;
} // namespace
std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
std::lock_guard<std::mutex> lock(ipc_mutex_);
auto iter = ipc_handle_to_baseptr_.find(handle);
if (iter != ipc_handle_to_baseptr_.end()) {
auto baseptr = iter->second.lock();
if (baseptr) return baseptr;
}
// The IpcMemHandle can only be opened once for the same handle,
// so we cache it here.
void *baseptr = nullptr;
auto ipc_handle =
reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
&baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
// Close ipc handle on the same device.
int device_id = platform::GetCurrentDeviceId();
// Add deleter to close ipc handle.
auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
platform::CUDADeviceGuard guard(device_id);
std::lock_guard<std::mutex> lock(ipc_mutex_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
ipc_handle_to_baseptr_.erase(handle);
VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
<< "\t" << ptr;
});
std::weak_ptr<void> wp = sp;
ipc_handle_to_baseptr_.insert(iter, {handle, wp});
return sp;
}
CudaIpcAllocation::~CudaIpcAllocation() {
shared_ptr_.reset();
VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
<< "\t" << this->ptr();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
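GetIpcBasePtr above opens each IPC handle at most once, caches it as a weak_ptr, and lets the shared_ptr deleter close the handle and drop the cache entry once the last user is gone. A CUDA-free sketch of the same caching pattern (OpenResource/CloseResource are hypothetical stand-ins for cudaIpcOpenMemHandle/cudaIpcCloseMemHandle):
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
namespace {
std::mutex cache_mutex;
std::unordered_map<std::string, std::weak_ptr<void>> cache;
void* OpenResource(const std::string& handle) {          // hypothetical open call
  return new int(static_cast<int>(handle.size()));
}
void CloseResource(void* ptr) { delete static_cast<int*>(ptr); }  // hypothetical close call
}  // namespace
std::shared_ptr<void> GetBasePtr(const std::string& handle) {
  std::lock_guard<std::mutex> lock(cache_mutex);
  auto it = cache.find(handle);
  if (it != cache.end()) {
    if (auto alive = it->second.lock()) return alive;  // already open, reuse it
  }
  void* raw = OpenResource(handle);
  auto sp = std::shared_ptr<void>(raw, [handle](void* ptr) {
    std::lock_guard<std::mutex> guard(cache_mutex);
    CloseResource(ptr);    // close exactly once, when the last user releases it
    cache.erase(handle);   // drop the stale cache entry
  });
  cache[handle] = sp;  // store only a weak reference so the cache never keeps it alive
  return sp;
}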
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,45 +12,45 @@ ...@@ -12,45 +12,45 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// This file defines the types used in PaddlePaddle MLIR dialect. #ifndef _WIN32
// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in
// tensorflow).
#pragma once #pragma once
#include <mlir/IR/Diagnostics.h> #include <memory>
#include <mlir/IR/Location.h> #include <mutex> // NOLINT
#include <mlir/IR/Operation.h> #include <string>
#include <mlir/IR/TypeUtilities.h> #include <unordered_set>
#include <mlir/IR/Types.h> #include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
namespace mlir { std::shared_ptr<void> GetIpcBasePtr(std::string handle);
namespace PD {
class PaddleType : public Type { class CudaIpcAllocation : public Allocation {
public: public:
using Type::Type; explicit CudaIpcAllocation(void *ptr, size_t size, int device_id,
std::shared_ptr<void> shared_ptr)
: Allocation(ptr, size, platform::CUDAPlace(device_id)),
device_id_(std::move(device_id)),
shared_ptr_(std::move(shared_ptr)) {}
static bool classof(Type type); inline const int &device_id() const { return device_id_; }
};
namespace detail { ~CudaIpcAllocation() override;
template <typename Derived> private:
class PaddleTypeImpl : public Type::TypeBase<Derived, PaddleType, TypeStorage> { int device_id_;
public: std::shared_ptr<void> shared_ptr_;
using Base = typename Type::TypeBase<Derived, PaddleType, TypeStorage>;
using PDBase = PaddleTypeImpl<Derived>;
using Base::Base;
}; };
} // namespace detail } // namespace allocation
} // namespace memory
#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ } // namespace paddle
class pdtype##Type : public detail::PaddleTypeImpl<pdtype##Type> { \
public: \
using PDBase::PDBase; \
};
} // namespace PD #endif
} // namespace mlir
...@@ -29,6 +29,155 @@ namespace paddle { ...@@ -29,6 +29,155 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
std::string GetIPCName() {
static std::random_device rd;
std::string handle = "/paddle_";
#ifdef _WIN32
handle += std::to_string(GetCurrentProcessId());
#else
handle += std::to_string(getpid());
#endif
handle += "_";
handle += std::to_string(rd());
return handle;
}
struct CountInfo {
std::atomic<int> refcount;
};
void AllocateMemoryMap(std::string filename, int flags, size_t size,
void **map_ptr_, int *fd_) {
// TODO(@ZHUI): support win32
int file_flags = 0;
int fd = -1;
if (flags & MAPPED_SHAREDMEM) {
file_flags = O_RDWR | O_CREAT;
} else {
file_flags = O_RDONLY;
}
if (flags & MAPPED_EXCLUSIVE) {
file_flags |= O_EXCL;
}
if (flags & MAPPED_NOCREATE) {
file_flags &= ~O_CREAT;
}
if (!(flags & MAPPED_FROMFD)) {
if (flags & MAPPED_SHAREDMEM) {
fd = shm_open(filename.c_str(), file_flags, (mode_t)0600);
PADDLE_ENFORCE_NE(
fd, -1,
platform::errors::Unavailable(
"File descriptor %s open failed, unable in read-write mode",
filename.c_str()));
VLOG(6) << "shm_open: " << filename;
}
} else {
fd = -1;
}
PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
platform::errors::Unavailable(
"Fruncate a file to a specified length failed!"));
if (flags & MAPPED_SHAREDMEM) {
*map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
} else {
*map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
}
PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED,
platform::errors::Unavailable(
"Memory map failed when create shared memory."));
if (flags & MAPPED_KEEPFD) {
*fd_ = fd;
} else {
PADDLE_ENFORCE_NE(::close(fd), -1,
platform::errors::Unavailable(
"Error closing memory maped file <", filename, ">"));
*fd_ = -1;
}
}
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
size_t size) {
int fd = -1;
void *base_ptr = nullptr;
AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd);
void *aligned_base_ptr =
      static_cast<void *>(static_cast<char *>(base_ptr) + mmap_alignment);
  return std::make_shared<RefcountedMemoryMapAllocation>(aligned_base_ptr, size,
filename, flags, fd);
}
RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation(
void *ptr, size_t size, std::string ipc_name, int fd, int flags)
: MemoryMapAllocation(ptr, size, ipc_name, fd, flags) {
// must reset base ptr first.
resetBaseptr();
initializeRefercount();
}
void MemoryMapAllocation::close() {
if (closed_) {
return;
}
closed_ = true;
}
MemoryMapAllocation::~MemoryMapAllocation() { close(); }
void RefcountedMemoryMapAllocation::incref() {
CountInfo *info = static_cast<CountInfo *>(map_ptr_);
++info->refcount;
}
int RefcountedMemoryMapAllocation::decref() {
CountInfo *info = static_cast<CountInfo *>(map_ptr_);
return --info->refcount == 0;
}
void RefcountedMemoryMapAllocation::resetBaseptr() {
map_ptr_ =
static_cast<void *>(static_cast<char *>(map_ptr_) - mmap_alignment);
map_size_ = map_size_ + mmap_alignment;
}
void RefcountedMemoryMapAllocation::initializeRefercount() {
CountInfo *info = reinterpret_cast<CountInfo *>(map_ptr_);
if (flags_ & MAPPED_EXCLUSIVE) {
new (&info->refcount) std::atomic<int>(1);
} else {
info->refcount++;
}
}
void RefcountedMemoryMapAllocation::close() {
if (closed_) {
return;
}
closed_ = true;
void *data = map_ptr_;
CountInfo *info = reinterpret_cast<CountInfo *>(data);
if (--info->refcount == 0) {
PADDLE_ENFORCE_NE(
shm_unlink(ipc_name_.c_str()), -1,
platform::errors::Unavailable(
"could not unlink the shared memory file ", ipc_name_));
VLOG(6) << "shm_unlink file: " << ipc_name_;
}
PADDLE_ENFORCE_NE(
munmap(map_ptr_, map_size_), -1,
platform::errors::Unavailable("could not unmap the shared memory file: ",
strerror(errno), " (", errno, ")"));
}
MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { MemoryMapWriterAllocation::~MemoryMapWriterAllocation() {
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
munmap(this->ptr(), this->size()), -1, munmap(this->ptr(), this->size()), -1,
...@@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() { ...@@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() {
/* Here we do not pay attention to the result of shm_unlink, /* Here we do not pay attention to the result of shm_unlink,
because the memory mapped file may have been cleared due to the because the memory mapped file may have been cleared due to the
MemoryMapFdSet::Clear() */ MemoryMapFdSet::Clear() */
// Code of DataLoader subprocess:
//
// core._array_to_share_memory_tensor(b)
// out_queue.put((idx, tensor_list, structure))
// core._remove_tensor_list_mmap_fds(tensor_list)
/* If the tensor is already in the send queue, it will be
* destructed by this function. If the tensor has not been sent yet, it
* will be cleared by MemoryMapFdSet::Clear().
* If `_remove_tensor_list_mmap_fds` has been interrupted, the
* tensor will be cleared by both methods.
* */
shm_unlink(this->ipc_name().c_str()); shm_unlink(this->ipc_name().c_str());
MemoryMapFdSet::Instance().Remove(this->ipc_name()); MemoryMapFdSet::Instance().Remove(this->ipc_name());
VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name();
} }
std::string GetIPCName() {
static std::random_device rd;
std::string handle = "/paddle_";
#ifdef _WIN32
handle += std::to_string(GetCurrentProcessId());
#else
handle += std::to_string(getpid());
#endif
handle += "_";
handle += std::to_string(rd());
return handle;
}
std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation( std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
size_t size) { size_t size) {
const std::string &ipc_name = GetIPCName(); const std::string &ipc_name = GetIPCName();
int flags = O_RDWR | O_CREAT; int flags = O_RDWR | O_CREAT;
int fd = shm_open(ipc_name.c_str(), flags, 0600);
int fd = shm_open(ipc_name.c_str(), flags, 0644);
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
fd, -1, platform::errors::Unavailable("File descriptor %s open failed", fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
ipc_name.c_str())); ipc_name.c_str()));
...@@ -86,12 +235,14 @@ std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation( ...@@ -86,12 +235,14 @@ std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation( std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation(
const std::string &ipc_name, size_t size) { const std::string &ipc_name, size_t size) {
int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); int flags = O_RDWR | O_CREAT;
flags &= ~O_CREAT;
int fd = shm_open(ipc_name.c_str(), flags, 0600);
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
fd, -1, platform::errors::Unavailable("File descriptor %s open failed", fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
ipc_name.c_str())); ipc_name.c_str()));
void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
PADDLE_ENFORCE_NE(ptr, MAP_FAILED, PADDLE_ENFORCE_NE(ptr, MAP_FAILED,
platform::errors::Unavailable( platform::errors::Unavailable(
"Memory map failed when rebuild shared memory.")); "Memory map failed when rebuild shared memory."));
......
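RefcountedMemoryMapAllocation above keeps the reference count inside the mapping itself: the first mmap_alignment bytes hold a CountInfo, constructed with placement-new when MAPPED_EXCLUSIVE is set and incremented otherwise. A minimal sketch of that layout, using a plain memory region instead of shm_open/mmap (the helper names are illustrative):
#include <atomic>
#include <cstddef>
#include <new>
constexpr std::size_t kAlignment = 64;  // mirrors mmap_alignment above
struct CountInfo {
  std::atomic<int> refcount;
};
// Creator side (MAPPED_EXCLUSIVE): construct the counter in place with value 1.
void* InitRegion(void* base) {
  auto* info = static_cast<CountInfo*>(base);
  new (&info->refcount) std::atomic<int>(1);        // placement-new into the region
  return static_cast<char*>(base) + kAlignment;     // payload starts after the header
}
// Attaching side: just bump the existing counter.
void* AttachRegion(void* base) {
  auto* info = static_cast<CountInfo*>(base);
  ++info->refcount;
  return static_cast<char*>(base) + kAlignment;
}
// Detach: returns true when the caller was the last user and may unlink/unmap.
bool DetachRegion(void* payload) {
  void* base = static_cast<char*>(payload) - kAlignment;
  auto* info = static_cast<CountInfo*>(base);
  return --info->refcount == 0;
}
The caller that sees DetachRegion return true would then unlink and unmap the region, which matches what RefcountedMemoryMapAllocation::close() does above.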
...@@ -16,8 +16,9 @@ ...@@ -16,8 +16,9 @@
#ifndef _WIN32 #ifndef _WIN32
#include <atomic>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
...@@ -28,6 +29,72 @@ namespace paddle { ...@@ -28,6 +29,72 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
std::string GetIPCName();
static constexpr int64_t mmap_alignment = 64;
enum MappedModes {
MAPPED_SHAREDMEM = 1,
MAPPED_EXCLUSIVE = 2,
MAPPED_NOCREATE = 4,
MAPPED_KEEPFD = 8,
MAPPED_FROMFD = 16,
MAPPED_UNLINK = 32
};
class MemoryMapAllocation : public Allocation {
public:
explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name)
: Allocation(ptr, size, platform::CPUPlace()),
ipc_name_(std::move(ipc_name)),
map_ptr_(ptr),
map_size_(size) {}
explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name,
int flags, int fd)
: Allocation(ptr, size, platform::CPUPlace()),
ipc_name_(std::move(ipc_name)),
fd_(fd),
flags_(flags),
map_ptr_(ptr),
map_size_(size) {}
inline const std::string &ipc_name() const { return ipc_name_; }
virtual void close();
~MemoryMapAllocation() override;
protected:
std::string ipc_name_;
int fd_ = -1;
int flags_ = 0;
void *map_ptr_ = nullptr;
size_t map_size_ = 0;
bool closed_ = false;
};
class RefcountedMemoryMapAllocation : public MemoryMapAllocation {
public:
RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name,
int flags, int fd);
void incref();
int decref();
void close() override;
virtual ~RefcountedMemoryMapAllocation() { close(); }
protected:
void initializeRefercount();
void resetBaseptr();
};
void AllocateMemoryMap(std::string filename, int flags, size_t size,
void **base_ptr_, int *fd_);
std::shared_ptr<RefcountedMemoryMapAllocation>
AllocateRefcountedMemoryMapAllocation(std::string filename, int flags,
size_t size);
class MemoryMapWriterAllocation : public Allocation { class MemoryMapWriterAllocation : public Allocation {
public: public:
explicit MemoryMapWriterAllocation(void *ptr, size_t size, explicit MemoryMapWriterAllocation(void *ptr, size_t size,
......
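A possible call site for the new entry point, assuming the declarations are available via mmap_allocator.h and that MAPPED_SHAREDMEM | MAPPED_EXCLUSIVE is the intended flag combination for a freshly created region (both are assumptions, not confirmed by this diff):
#include <cstddef>
#include <memory>
#include <string>
#include "paddle/fluid/memory/allocation/mmap_allocator.h"  // assumed header path
std::shared_ptr<paddle::memory::allocation::RefcountedMemoryMapAllocation>
MakeSharedBlock(std::size_t size) {
  using namespace paddle::memory::allocation;  // NOLINT
  const std::string name = GetIPCName();            // e.g. "/paddle_<pid>_<random>"
  const int flags = MAPPED_SHAREDMEM | MAPPED_EXCLUSIVE;  // creator owns the region
  return AllocateRefcountedMemoryMapAllocation(name, flags, size);
}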
...@@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); ...@@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor);
REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);
REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);
REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
ThresholdedReluFunctor, ThresholdedReluGradFunctor);
/* ========================== sigmoid register ============================= /* ========================== sigmoid register =============================
*/ */
...@@ -1567,23 +1570,6 @@ REGISTER_OPERATOR( ...@@ -1567,23 +1570,6 @@ REGISTER_OPERATOR(
ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>, ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>,
ops::ActivationTripleGradOpInplaceInferer); ops::ActivationTripleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
REGISTER_OP_CPU_KERNEL(
tanh_grad_grad, ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<float>>,
ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<double>>,
ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
ops::TanhGradGradFunctor<plat::float16>>);
// Register TripleGrad Kernel
REGISTER_OP_CPU_KERNEL(
tanh_triple_grad,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<float>>,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<double>>,
ops::TanhTripeGradKernel<plat::CPUDeviceContext,
ops::TanhTripleGradFunctor<plat::float16>>);
/* ========================================================================== */ /* ========================================================================== */
/* ========================== relu register ============================= */ /* ========================== relu register ============================= */
...@@ -1623,16 +1609,6 @@ REGISTER_OPERATOR( ...@@ -1623,16 +1609,6 @@ REGISTER_OPERATOR(
ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>, ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer); ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
LeakyReluGradFunctor);
REGISTER_OP_CPU_KERNEL(
leaky_relu_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::LeakyReluGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::LeakyReluGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<
plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
/* ========================================================================== */ /* ========================================================================== */
/* ======================== elu register ============================ */ /* ======================== elu register ============================ */
......
...@@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> { ...@@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
template <typename T> \ template <typename T> \
using name##GradFunctor = phi::funcs::name##GradFunctor<T>; using name##GradFunctor = phi::funcs::name##GradFunctor<T>;
#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \
template <typename T> \
using name##GradGradFunctor = phi::funcs::name##GradGradFunctor<T>;
#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \
template <typename T> \
using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor<T>;
USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Cos)
USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Tan)
USE_PHI_FUNCTOR(Acos) USE_PHI_FUNCTOR(Acos)
...@@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh) ...@@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh)
USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Asinh)
USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Acosh)
USE_PHI_FUNCTOR(Atanh) USE_PHI_FUNCTOR(Atanh)
USE_PHI_FUNCTOR(Tanh)
USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
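// Editor's sketch (not part of the diff): the USE_PHI_* macros above only create template
// aliases, so the operator layer keeps its old functor names while the implementation lives
// in phi::funcs. A self-contained illustration of that aliasing pattern; `demo_phi` and the
// DEMO_ macro are stand-ins, not real Paddle symbols.
#include <iostream>

namespace demo_phi {
template <typename T>
struct TanhGradFunctor {
  T operator()(T dout, T out) const { return dout * (T(1) - out * out); }
};
}  // namespace demo_phi

#define DEMO_USE_PHI_FUNCTOR(name) \
  template <typename T>            \
  using name##GradFunctor = demo_phi::name##GradFunctor<T>;

DEMO_USE_PHI_FUNCTOR(Tanh)  // caller code keeps writing TanhGradFunctor<T>

int main() {
  std::cout << TanhGradFunctor<double>()(1.0, 0.5) << std::endl;  // prints 0.75
  return 0;
}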
template <typename T> template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> { struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
...@@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>; ...@@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T> template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>; using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.tanh();
}
};
template <typename T>
struct TanhGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
// tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
// * ddx)
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
dout_new.device(*d) =
static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
/*
Out
DOut D_Dout
DDx -> TanhTripleGrad -> D_DDx
D_DDout d_OutNew
D_Dout_new
D_Dout = (-2) * Out * DDx * D_Dout_new
D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
Out, DDX, DOut, D_DDOut, D_DOut_New // input
D_OutNew, D_DOut, D_DDx // output
*/
template <typename T>
struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
const framework::Tensor* d_DDOut,
const framework::Tensor* d_dOut_New,
framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
framework::Tensor* d_DDx) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
auto d_ddOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
auto d_dOutNew = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
if (d_Out_New) {
auto d_OutNew = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
(static_cast<T>(2) * dout * ddx * d_dOutNew);
}
if (d_d_Out) {
auto d_dOut = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
}
if (d_DDx) {
auto d_ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
static_cast<T>(2) * out * dout * d_dOutNew;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
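// Editor's sketch (not part of the diff): the removed TanhGradGradFunctor /
// TanhTripleGradFunctor implement the scalar identities listed in the comment above.
// A standalone numeric check of those formulas, assuming out = tanh(x); all variable
// names here are local to the sketch.
#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.3, ddx = 0.7, dout = 1.2;
  const double out = std::tanh(x);

  // double grad: ddout = (1 - out^2) * ddx,  dout_new = -2 * out * dout * ddx
  const double ddout = (1.0 - out * out) * ddx;
  const double dout_new = -2.0 * out * dout * ddx;

  // triple grad, given upstream gradients d_ddout and d_dout_new:
  const double d_ddout = 0.4, d_dout_new = 0.9;
  const double d_dout = -2.0 * out * ddx * d_dout_new;
  const double d_ddx = (1.0 - out * out) * d_ddout - 2.0 * out * dout * d_dout_new;
  const double d_out_new =
      -2.0 * out * ddx * d_ddout - 2.0 * dout * ddx * d_dout_new;

  std::printf("ddout=%.6f dout_new=%.6f d_dout=%.6f d_ddx=%.6f d_out_new=%.6f\n",
              ddout, dout_new, d_dout, d_ddx, d_out_new);
  return 0;
}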
// tanhshrink(x) = x - tanh(x) // tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T> template <typename T>
...@@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> { ...@@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
// NOTE: GetAttrs here intentionally hides `BaseActivationFunctor<T>::GetAttrs`
// (no virtual dispatch), trading polymorphism for speed.
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
}
};
template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout *
((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
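// Editor's sketch (not part of the diff): scalar form of the removed BRelu functors —
// the forward clips x into [t_min, t_max], and the gradient passes dout only where
// t_min < x < t_max. The helper names below are hypothetical.
#include <algorithm>
#include <cstdio>

double brelu(double x, double t_min, double t_max) {
  return std::min(std::max(x, t_min), t_max);
}

double brelu_grad(double x, double dout, double t_min, double t_max) {
  return (x > t_min && x < t_max) ? dout : 0.0;
}

int main() {
  // x above t_max: forward saturates at 6, gradient is blocked.
  std::printf("%f %f\n", brelu(10.0, 0.0, 6.0), brelu_grad(10.0, 1.0, 0.0, 6.0));
  return 0;
}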
// relu6(x) = min(max(0, x), 6) // relu6(x) = min(max(0, x), 6)
template <typename T> template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> { struct Relu6Functor : public BaseActivationFunctor<T> {
...@@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
if (alpha < 1.f) {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
} else {
out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
}
}
};
template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 =
static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
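// Editor's sketch (not part of the diff): scalar form of the removed LeakyRelu functors —
// the forward keeps x for x >= 0 and scales by alpha otherwise, and the gradient routes
// dout the same way. Helper names are hypothetical.
#include <cstdio>

double leaky_relu(double x, double alpha) { return x >= 0.0 ? x : alpha * x; }

double leaky_relu_grad(double x, double dout, double alpha) {
  return x >= 0.0 ? dout : alpha * dout;
}

int main() {
  std::printf("%f %f\n", leaky_relu(-2.0, 0.1), leaky_relu_grad(-2.0, 1.0, 0.1));  // -0.2 0.1
  return 0;
}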
template <typename T> template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> { struct ELUFunctor : public BaseActivationFunctor<T> {
float alpha; float alpha;
...@@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> { ...@@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
auto th = static_cast<T>(threshold);
out.device(d) = (x > th).template cast<T>() * x;
}
};
template <typename T>
struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto th = static_cast<T>(threshold);
dx.device(d) = dout * (x > th).template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T> template <typename T>
struct HardSigmoidFunctor : public BaseActivationFunctor<T> { struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
float slope; float slope;
...@@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
/*
* in arguments: x, out, ddx
* out arguments: ddout, dout, dx
*/
template <ActBwdOpFwdDeps kDepValue>
inline void ExtractActivationDoubleGradTensor(
const framework::ExecutionContext& ctx, const framework::Tensor** X,
const framework::Tensor** Out, const framework::Tensor** ddX,
framework::Tensor** dX, framework::Tensor** dOut,
framework::Tensor** ddOut) {
auto ddx_var = ctx.InputVar("DDX");
auto ddo_var = ctx.OutputVar("DDOut");
PADDLE_ENFORCE_NOT_NULL(
ddx_var, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("DDX")));
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var);
if (ddo_var) {
*ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
ddo_var);
}
} else {
*ddX = ctx.Input<framework::Tensor>("DDX");
if (ddo_var) {
*ddOut = ctx.Output<framework::Tensor>("DDOut");
}
}
PADDLE_ENFORCE_NOT_NULL(
*ddX,
platform::errors::NotFound(
"Cannot get the tensor from the Variable Output, variable name = %s",
ctx.OutputName("DDX")));
if (static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NOT_NULL(
x_var, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("X")));
auto dx_var = ctx.OutputVar("DX");
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
if (dx_var) {
*dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
dx_var);
}
} else {
*X = ctx.Input<framework::Tensor>("X");
if (dx_var) {
*dX = ctx.Output<framework::Tensor>("DX");
}
}
} else {
VLOG(10) << "Inplace activation of Op: " << ctx.Type();
*X = *ddX;
}
if (static_cast<int>(kDepValue) &
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
auto out_var = ctx.InputVar("Out");
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"Cannot get the tensor from the Variable Out, variable name = %s",
ctx.InputName("Out")));
auto dout_var = ctx.OutputVar("DOut");
if (CanBeUsedBySelectedRows.count(ctx.Type())) {
*Out =
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
if (dout_var) {
*dOut =
paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
dout_var);
}
} else {
*Out = ctx.Input<framework::Tensor>("Out");
if (dout_var) {
*dOut = ctx.Output<framework::Tensor>("DOut");
}
}
} else {
VLOG(10) << "Inplace activation of Op: " << ctx.Type();
*Out = *ddX;
}
}
template <typename DeviceContext, typename Functor>
class ActivationDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *X, *Out, *ddX;
X = Out = ddX = nullptr;
framework::Tensor *ddOut, *dOut, *dX;
ddOut = dOut = dX = nullptr;
ExtractActivationDoubleGradTensor<Functor::FwdDeps()>(ctx, &X, &Out, &ddX,
&dX, &dOut, &ddOut);
if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
if (dOut) dOut->mutable_data<T>(ctx.GetPlace());
if (dX) dX->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(place, X, Out, ddX, ddOut, dOut, dX);
}
};
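// Editor's sketch (not part of the diff): the attribute-copy loop in the kernel above
// relies on each functor exposing name -> float* pairs via GetAttrs(). A reduced,
// framework-free version of that pattern; DemoLeakyReluFunctor and the std::map standing
// in for the ExecutionContext attributes are assumptions of this sketch.
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct DemoLeakyReluFunctor {
  float alpha;
  std::vector<std::pair<const char*, float*>> GetAttrs() { return {{"alpha", &alpha}}; }
  float operator()(float x) const { return x >= 0.f ? x : alpha * x; }
};

int main() {
  std::map<std::string, float> ctx_attrs = {{"alpha", 0.2f}};
  DemoLeakyReluFunctor functor;
  auto attrs = functor.GetAttrs();
  for (auto& attr : attrs) {
    *attr.second = ctx_attrs.at(attr.first);  // mirrors *attr.second = ctx.Attr<float>(attr.first)
  }
  std::printf("%f\n", functor(-1.0f));  // -0.2
  return 0;
}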
template <typename T> template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> { struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device> template <typename Device>
...@@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* Out, const framework::Tensor* ddX,
framework::Tensor* ddOut, framework::Tensor* dOut,
framework::Tensor* dX) const {
if (ddOut) {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
ddout.device(*d) =
ddx *
((x > static_cast<T>(0)).template cast<T>() +
static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
.template cast<T>();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T> template <typename T>
struct ELUGradGradFunctor : public BaseActivationFunctor<T> { struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
float alpha; float alpha;
...@@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(log2, Log2, Log2Functor, Log2GradFunctor); \
__macro(log10, Log10, Log10Functor, Log10GradFunctor); \ __macro(log10, Log10, Log10Functor, Log10GradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \
__macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \
...@@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
__macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \
HardSigmoidGradFunctor); \ HardSigmoidGradFunctor); \
__macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \
__macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \
ThresholdedReluGradFunctor); \
__macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \
__macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
...@@ -18,38 +18,6 @@ limitations under the License. */ ...@@ -18,38 +18,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// leakyrelu(x) = x > 0 ? x : alpha * x
__device__ __forceinline__ T operator()(const T x) const {
return x > zero ? x : static_cast<T>(alpha) * x;
}
};
template <typename T>
struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float alpha;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"alpha", &alpha}};
}
// dx = dout * (x > 0 ? 1 : alpha)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > zero ? dout : static_cast<T>(alpha) * dout;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T> template <typename T>
struct CudaSigmoidFunctor : public BaseActivationFunctor<T> { struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type; using MPType = typename details::MPTypeTrait<T>::Type;
...@@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> { ...@@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct CudaTanhFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(tanh(x));
}
};
template <typename T>
struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * (1 - out^2)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * (one - out * out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
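// Editor's sketch (not part of the diff): CudaTanhFunctor casts the stored type up to
// MPType (e.g. float for float16 tensors), evaluates tanh there, and casts back to avoid
// precision loss in the transcendental. A host-side illustration of the same pattern,
// with float standing in for the stored type and double for the compute type:
#include <cmath>
#include <cstdio>

template <typename StoredT, typename ComputeT>
StoredT tanh_in_higher_precision(StoredT x) {
  const ComputeT xc = static_cast<ComputeT>(x);   // promote before computing
  return static_cast<StoredT>(std::tanh(xc));     // demote only at the end
}

int main() {
  std::printf("%f\n", tanh_in_higher_precision<float, double>(0.5f));  // ~0.462117
  return 0;
}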
template <typename T> template <typename T>
struct CudaReciprocalFunctor : public BaseActivationFunctor<T> { struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f); T one = static_cast<T>(1.0f);
...@@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> { ...@@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct CudaBReluFunctor : public BaseActivationFunctor<T> {
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// brelu(x) = min(max(x, t_min), t_max)
__device__ __forceinline__ T operator()(const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
T temp_max = x > t_min_cast ? x : t_min_cast;
T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
return temp_min;
}
};
template <typename T>
struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float t_min;
float t_max;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"t_min", &t_min}, {"t_max", &t_max}};
}
// dx = (x > t_min && x < t_max) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
T t_min_cast = static_cast<T>(t_min);
T t_max_cast = static_cast<T>(t_max);
return (x > t_min_cast && x < t_max_cast) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T> template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> { struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type; using MPType = typename details::MPTypeTrait<T>::Type;
...@@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> { ...@@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
}; };
template <typename T>
struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// thresholded_relu(x) = x > threshold ? x : 0
__device__ __forceinline__ T operator()(const T x) const {
return x > static_cast<T>(threshold) ? x : zero;
}
};
template <typename T>
struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = x > threshold ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return x > static_cast<T>(threshold) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T> template <typename T>
struct CudaHardSwishFunctor : public BaseActivationFunctor<T> { struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f); T zero = static_cast<T>(0.0f);
...@@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel ...@@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel
} }
}; };
USE_PHI_FUNCTOR(CudaCos)
USE_PHI_FUNCTOR(CudaTan)
USE_PHI_FUNCTOR(CudaAcos)
USE_PHI_FUNCTOR(CudaSin)
USE_PHI_FUNCTOR(CudaAsin)
USE_PHI_FUNCTOR(CudaAtan)
USE_PHI_FUNCTOR(CudaSinh)
USE_PHI_FUNCTOR(CudaCosh)
USE_PHI_FUNCTOR(CudaAsinh)
USE_PHI_FUNCTOR(CudaAcosh)
USE_PHI_FUNCTOR(CudaAtanh)
USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -1270,20 +1158,6 @@ namespace plat = paddle::platform; ...@@ -1270,20 +1158,6 @@ namespace plat = paddle::platform;
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>); ops::grad_functor<plat::bfloat16>>);
/* ======================== leaky relu register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
CudaLeakyReluGradFunctor);
REGISTER_OP_CUDA_KERNEL(
leaky_relu_grad_grad,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::LeakyReluGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::LeakyReluGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<
plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ======================== elu register ============================ */ /* ======================== elu register ============================ */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
...@@ -1348,29 +1222,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1348,29 +1222,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::SigmoidTripleGradFunctor<plat::bfloat16>>); ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
/* ========================================================================== */ /* ========================================================================== */
/* =========================== tanh register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
CudaTanhGradFunctor);
REGISTER_OP_CUDA_KERNEL(
tanh_grad_grad,
ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhGradGradFunctor<float>>,
ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhGradGradFunctor<double>>,
ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
ops::TanhGradGradFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
tanh_triple_grad,
ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhTripleGradFunctor<float>>,
ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
ops::TanhTripleGradFunctor<double>>,
ops::TanhTripeGradKernel<plat::CUDADeviceContext,
ops::TanhTripleGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== sqrt register ============================= */ /* =========================== sqrt register ============================= */
REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
CudaSqrtGradFunctor); CudaSqrtGradFunctor);
...@@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL(
__macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \
__macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \
__macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \
__macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \
__macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
__macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \
__macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \
...@@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL(
CudaHardSigmoidGradFunctor); \ CudaHardSigmoidGradFunctor); \
__macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \
__macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \
__macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \
CudaThresholdedReluGradFunctor); \
__macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \
CudaHardSwishGradFunctor); CudaHardSwishGradFunctor);
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
......
...@@ -12,52 +12,20 @@ ...@@ -12,52 +12,20 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/allclose_op.h"
#include <cmath> #include <cmath>
#include <string> #include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct GetTensorValue<platform::CPUDeviceContext, T> {
T operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor& tensor) const {
return *(tensor.data<T>());
}
};
template <typename T>
struct AllcloseFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& other,
const double rtol, const double atol, bool equal_nan,
framework::Tensor* output) {
auto* in_a = in.data<T>();
auto* in_b = other.data<T>();
auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
auto num = in.numel();
*out_data = true;
for (int i = 0; i < num; i++) {
const T a = in_a[i], b = in_b[i];
bool val;
if (std::isnan(a) || std::isnan(b)) {
val = equal_nan && std::isnan(a) == std::isnan(b);
} else {
T left = (a > b ? a - b : b - a);
T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
T diff = (left > right ? left - right : right - left);
val = a == b || left <= right || diff <= 1e-15;
}
*out_data &= val;
}
}
};
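// Editor's sketch (not part of the diff): per element, the removed CPU AllcloseFunctor
// checks |a - b| <= atol + rtol * |b|, with optional NaN equality (the removed code also
// tolerates a tiny fixed 1e-15 slack, omitted here). A scalar version of the core predicate:
#include <cmath>
#include <cstdio>

bool is_close(double a, double b, double rtol, double atol, bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main() {
  std::printf("%d %d\n", is_close(1.0, 1.0000001, 1e-5, 1e-8, false),
              is_close(1.0, 1.1, 1e-5, 1e-8, false));  // 1 0
  return 0;
}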
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { ...@@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose");
OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose");
auto input_dim = ctx->GetInputDim("Input");
auto other_dim = ctx->GetInputDim("Other");
PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
platform::errors::PreconditionNotMet(
"Input(Input) and Input(Other) must have the same "
"dimension size."));
int n = input_dim.size();
bool is_runtime = ctx->IsRuntime();
for (int i = 0; i < n; i++) {
if (is_runtime) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
} else {
if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
platform::errors::PreconditionNotMet(
"The value at dim %d of Input(Input) is not "
"equal to the Input(Other): %ld != %ld.",
i, input_dim[i], other_dim[i]));
}
}
}
ctx->SetOutputDim("Out", phi::make_ddim({1}));
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { ...@@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference {
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext; using CPU = paddle::platform::CPUDeviceContext;
DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor,
PD_INFER_META(phi::AllValueCompareInferMeta));
REGISTER_OPERATOR( REGISTER_OPERATOR(
allclose, ops::AllcloseOp, ops::AllcloseOpMaker, allclose, ops::AllcloseOp, ops::AllcloseOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::AllcloseOpVarTypeInference); ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor);
REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel<CPU, float>,
ops::AllcloseKernel<CPU, double>);
/* ========================== register checkpoint ===========================*/ /* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(allclose) REGISTER_OP_VERSION(allclose)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
struct GetTensorValue {
T operator()(const platform::DeviceContext& ctx,
const framework::Tensor& tensor) const;
};
template <typename DeviceContext, typename T>
struct AllcloseFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& other, const float rtol,
const float atol, bool equal_nan, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
class AllcloseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// get attrs
bool equal_nan = ctx.Attr<bool>("equal_nan");
// get input/output
const auto* input = ctx.Input<Tensor>("Input");
const auto* other = ctx.Input<Tensor>("Other");
auto* out = ctx.Output<Tensor>("Out");
double rtol_v = std::stod(ctx.Attr<std::string>("rtol"));
double atol_v = std::stod(ctx.Attr<std::string>("atol"));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
GetTensorValue<DeviceContext, double> get_tensor_value;
if (ctx.HasInput("Rtol")) {
const auto* rtol = ctx.Input<Tensor>("Rtol");
PADDLE_ENFORCE_EQ(
rtol->numel(), 1,
platform::errors::InvalidArgument(
"Input(Rtol) size must be 1, but get %d.", rtol->numel()));
PADDLE_ENFORCE_EQ(
framework::TransToProtoVarType(rtol->dtype()),
framework::proto::VarType::FP64,
platform::errors::InvalidArgument(
"Input(Rtol) type must be double, but get %s.",
framework::DataTypeToString(
framework::TransToProtoVarType(rtol->dtype()))));
rtol_v = get_tensor_value(dev_ctx, *rtol);
}
if (ctx.HasInput("Atol")) {
const auto* atol = ctx.Input<Tensor>("Atol");
PADDLE_ENFORCE_EQ(
atol->numel(), 1,
platform::errors::InvalidArgument(
"Input(Atol) size must be 1, but get %d", atol->numel()));
PADDLE_ENFORCE_EQ(
framework::TransToProtoVarType(atol->dtype()),
framework::proto::VarType::FP64,
platform::errors::InvalidArgument(
"Input(Atol) type must be double, but get %s",
framework::DataTypeToString(
framework::TransToProtoVarType(atol->dtype()))));
atol_v = get_tensor_value(dev_ctx, *atol);
}
AllcloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
equal_nan, out);
}
};
} // namespace operators
} // namespace paddle
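// Editor's sketch (not part of the diff): in the deleted AllcloseKernel above, rtol/atol
// arrive as string attributes parsed with std::stod, and the optional Rtol/Atol tensor
// inputs, when present, take precedence. The precedence logic reduced to plain C++ with
// hypothetical names:
#include <cstdio>
#include <string>

double resolve_tolerance(const std::string& attr_value, const double* tensor_value) {
  double v = std::stod(attr_value);  // default: parse the string attribute
  if (tensor_value != nullptr) {     // optional tensor input overrides the attribute
    v = *tensor_value;
  }
  return v;
}

int main() {
  const double rtol_from_tensor = 1e-3;
  std::printf("%g %g\n", resolve_tolerance("1e-05", nullptr),
              resolve_tolerance("1e-05", &rtol_from_tensor));  // 1e-05 0.001
  return 0;
}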
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
found_inf->mutable_data<bool>(dev_ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(*scale);
MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY,
ToCnnlDataType<bool>());
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
// check is_finite or is_nan
Tensor is_finite(found_inf->type());
if (i != 0) {
is_finite.Resize(phi::make_ddim({1}));
is_finite.mutable_data<bool>(ctx.GetPlace());
} else {
is_finite.ShareDataWith(*found_inf);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x),
GetBasePtr(&is_finite));
// accumulate found_inf across inputs with a logical OR after checking each one
if (i != 0) {
MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY,
ToCnnlDataType<bool>());
MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(),
GetBasePtr(found_inf), is_finite_desc.get(),
GetBasePtr(&is_finite), found_inf_desc.get(),
GetBasePtr(found_inf));
}
// The usual logic is:
//   out = in,         if found_inf == true
//   out = in / scale, if found_inf == false
// But when found_inf is true the data of Out is not used downstream,
// so on MLU we always compute out as in / scale.
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
GetBasePtr(x), scale_desc.get(), GetBasePtr(scale),
out_desc.get(), GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleMLUKernel<float>,
ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
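// Editor's sketch (not part of the diff): the control flow of the MLU kernel above,
// reduced to scalars — found_inf is OR-accumulated across the inputs, and every output is
// unconditionally divided by scale (outputs are ignored downstream when found_inf is true).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> xs = {1.0, INFINITY, 3.0};
  const double scale = 2.0;
  bool found_inf = false;
  std::vector<double> outs;
  for (double x : xs) {
    found_inf = found_inf || !std::isfinite(x);  // mirrors IsNanInf + logical OR
    outs.push_back(x / scale);                   // always divide, even if non-finite
  }
  std::printf("found_inf=%d out0=%f\n", found_inf, outs[0]);  // found_inf=1 out0=0.5
  return 0;
}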
...@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/controlflow/compare_op.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle { namespace paddle {
......
...@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
...@@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker {
} }
}; };
class DiagV2GradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "DiagV2Grad");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.GetPlace());
}
};
template <typename T>
class DiagV2GradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("diag_v2_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor,
PD_INFER_META(phi::DiagInferMeta)); PD_INFER_META(phi::DiagInferMeta));
REGISTER_OPERATOR( REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker,
diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, ops::DiagV2GradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, ops::DiagV2GradOpMaker<paddle::imperative::OpBase>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>, DiagInferShapeFunctor);
DiagInferShapeFunctor);
REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp,
ops::DiagGradV2NoNeedBufferVarsInferer);
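// Editor's sketch (not part of the diff): for the newly registered diag_v2_grad, when X is
// a vector and Out = diag(X) is a matrix, the gradient is simply the diagonal of Out@GRAD.
// A scalar-loop illustration of that relation; the actual grad kernel (not shown in this
// hunk) presumably also handles the matrix-to-vector case and the offset attribute.
#include <cstdio>
#include <vector>

int main() {
  const int n = 3;
  std::vector<double> dout(n * n, 0.0);
  dout[0 * n + 0] = 1.0;
  dout[1 * n + 1] = 2.0;
  dout[2 * n + 2] = 3.0;
  std::vector<double> dx(n);
  for (int i = 0; i < n; ++i) dx[i] = dout[i * n + i];  // dX = diagonal(dOut)
  std::printf("%f %f %f\n", dx[0], dx[1], dx[2]);  // 1 2 3
  return 0;
}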
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -90,86 +87,6 @@ struct MinFunctor { ...@@ -90,86 +87,6 @@ struct MinFunctor {
template <typename T> template <typename T>
using Complex = paddle::platform::complex<T>; using Complex = paddle::platform::complex<T>;
// Fmax
template <typename T>
struct FMaxFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
return std::fmax(a, b);
}
};
template <>
struct FMaxFunctor<paddle::platform::float16> {
inline HOSTDEVICE paddle::platform::float16 operator()(
const paddle::platform::float16 a,
const paddle::platform::float16 b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmax(float_a, float_b);
return static_cast<paddle::platform::float16>(result);
}
};
template <>
struct FMaxFunctor<int> {
inline HOSTDEVICE int operator()(const int a, const int b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmax(float_a, float_b);
return std::lrint(result);
}
};
template <>
struct FMaxFunctor<int64_t> {
inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
double double_a = static_cast<double>(a);
double double_b = static_cast<double>(b);
auto result = std::fmax(double_a, double_b);
return std::llrint(result);
}
};
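// Editor's sketch (not part of the diff): unlike elementwise_max, the fmax path follows
// std::fmax semantics — if exactly one operand is NaN, the other operand is returned —
// and the integral specializations above compute in float/double and round back.
#include <cmath>
#include <cstdio>

int main() {
  std::printf("%f %f %ld\n",
              std::fmax(NAN, 1.5),                  // 1.5: the NaN operand is ignored
              std::fmax(2.0, 3.0),                  // 3.0: ordinary max
              std::lrint(std::fmax(3.0f, 4.0f)));   // 4: the int specialization's rounding
  return 0;
}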
// Fmin
template <typename T>
struct FMinFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
return std::fmin(a, b);
}
};
template <>
struct FMinFunctor<paddle::platform::float16> {
inline HOSTDEVICE paddle::platform::float16 operator()(
const paddle::platform::float16 a,
const paddle::platform::float16 b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmin(float_a, float_b);
return static_cast<paddle::platform::float16>(result);
}
};
template <>
struct FMinFunctor<int> {
inline HOSTDEVICE int operator()(const int a, const int b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmin(float_a, float_b);
return std::lrint(result);
}
};
template <>
struct FMinFunctor<int64_t> {
inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const {
double double_a = static_cast<double>(a);
double double_b = static_cast<double>(b);
auto result = std::fmin(double_a, double_b);
return std::llrint(result);
}
};
template <typename T> template <typename T>
struct MinGradXFunctor { struct MinGradXFunctor {
inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const {
......
...@@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ...@@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp,
ops::ElementwiseFMaxGradOpMaker<paddle::imperative::OpBase>); ops::ElementwiseFMaxGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_fmax,
ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFMaxKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
elementwise_fmax_grad,
ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
...@@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
int64_t>); int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_fmax,
ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseFMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_fmax_grad,
ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseFMaxGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> { ...@@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class ElementwiseFMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<FMaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
FMaxFunctor<T>(), z);
}
};
template <typename T> template <typename T>
struct MaxGradDx { struct MaxGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
...@@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> { ...@@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
} }
}; };
template <typename T>
struct FMaxGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * static_cast<T>((x >= y) || isnan(y));
}
};
template <>
struct FMaxGradDx<paddle::platform::float16> {
HOSTDEVICE paddle::platform::float16 operator()(
paddle::platform::float16 x, paddle::platform::float16 y,
paddle::platform::float16 out, paddle::platform::float16 dout) const {
return dout * static_cast<paddle::platform::float16>(
(x >= y) || paddle::platform::isnan(y));
}
};
template <>
struct FMaxGradDx<int> {
HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
return dout * static_cast<int>((x >= y));
}
};
template <>
struct FMaxGradDx<int64_t> {
HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
int64_t dout) const {
return dout * static_cast<int64_t>((x >= y));
}
};
template <typename T>
struct FMaxGradDy {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * static_cast<T>(!((x >= y) || isnan(y)));
}
};
template <>
struct FMaxGradDy<paddle::platform::float16> {
HOSTDEVICE paddle::platform::float16 operator()(
paddle::platform::float16 x, paddle::platform::float16 y,
paddle::platform::float16 out, paddle::platform::float16 dout) const {
return dout * static_cast<paddle::platform::float16>(
!((x >= y) || paddle::platform::isnan(y)));
}
};
template <>
struct FMaxGradDy<int64_t> {
HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
int64_t dout) const {
return dout * static_cast<int64_t>(!((x >= y)));
}
};
template <>
struct FMaxGradDy<int> {
HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
return dout * static_cast<int>(!((x >= y)));
}
};
template <typename DeviceContext, typename T>
class ElementwiseFMaxGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* out = dout; // Fake out, not used
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, FMaxGradDx<T>, FMaxGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx<T>(),
FMaxGradDy<T>());
}
};
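// Editor's sketch (not part of the diff): FMaxGradDx/FMaxGradDy above split dout between
// x and y — dout flows to x when x >= y or y is NaN, otherwise to y — so for every element
// dx + dy == dout. A scalar check of that routing:
#include <cmath>
#include <cstdio>

void fmax_grad(double x, double y, double dout, double* dx, double* dy) {
  const bool to_x = (x >= y) || std::isnan(y);
  *dx = to_x ? dout : 0.0;
  *dy = to_x ? 0.0 : dout;
}

int main() {
  double dx, dy;
  fmax_grad(1.0, 2.0, 1.0, &dx, &dy);
  std::printf("%f %f\n", dx, dy);  // 0 1
  fmax_grad(1.0, NAN, 1.0, &dx, &dy);
  std::printf("%f %f\n", dx, dy);  // 1 0
  return 0;
}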
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ...@@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp,
ops::ElementwiseFMinGradOpMaker<paddle::imperative::OpBase>); ops::ElementwiseFMinGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_fmin,
ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFMinKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
elementwise_fmin_grad,
ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFMinGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
...@@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext,
int64_t>); int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_fmin,
ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseFMinKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_fmin_grad,
ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseFMinGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel<T> { ...@@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class ElementwiseFMinKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<FMinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
FMinFunctor<T>(), z);
}
};
template <typename T> template <typename T>
struct MinGradDx { struct MinGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
...@@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> { ...@@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
ElementwiseMinGrad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy); ElementwiseMinGrad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} }
}; };
template <typename T>
struct FMinGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * static_cast<T>((x <= y) || isnan(y));
}
};
template <>
struct FMinGradDx<paddle::platform::float16> {
HOSTDEVICE paddle::platform::float16 operator()(
paddle::platform::float16 x, paddle::platform::float16 y,
paddle::platform::float16 out, paddle::platform::float16 dout) const {
return dout * static_cast<paddle::platform::float16>(
(x <= y) || paddle::platform::isnan(y));
}
};
template <>
struct FMinGradDx<int> {
HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
return dout * static_cast<int>((x <= y));
}
};
template <>
struct FMinGradDx<int64_t> {
HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
int64_t dout) const {
return dout * static_cast<int64_t>((x <= y));
}
};
template <typename T>
struct FMinGradDy {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * static_cast<T>(!((x <= y) || isnan(y)));
}
};
template <>
struct FMinGradDy<paddle::platform::float16> {
HOSTDEVICE paddle::platform::float16 operator()(
paddle::platform::float16 x, paddle::platform::float16 y,
paddle::platform::float16 out, paddle::platform::float16 dout) const {
return dout * static_cast<paddle::platform::float16>(
!((x <= y) || paddle::platform::isnan(y)));
}
};
template <>
struct FMinGradDy<int> {
HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
return dout * static_cast<int>(!((x <= y)));
}
};
template <>
struct FMinGradDy<int64_t> {
HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out,
int64_t dout) const {
return dout * static_cast<int64_t>(!((x <= y)));
}
};
template <typename DeviceContext, typename T>
class ElementwiseFMinGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* out = dout; // Fake out, not used
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, FMinGradDx<T>, FMinGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx<T>(),
FMinGradDy<T>());
}
};
}  // namespace operators
}  // namespace paddle
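The FMinGradDx/FMinGradDy functors removed above route the upstream gradient to whichever operand fmin actually selected, with a NaN in y pushing the gradient to x (the integer specializations simply drop the isnan term). A minimal standalone sketch of that routing rule, with illustrative names that are not part of the Paddle API:

#include <cmath>
#include <cstdio>

// Gradient routing used by fmin: dx receives dout when x <= y or y is NaN,
// otherwise dy receives dout. Hedged sketch, not Paddle code.
template <typename T>
void FMinGradRouting(T x, T y, T dout, T* dx, T* dy) {
  bool takes_x = (x <= y) || std::isnan(static_cast<double>(y));
  *dx = takes_x ? dout : T(0);
  *dy = takes_x ? T(0) : dout;
}

int main() {
  float dx = 0.f, dy = 0.f;
  FMinGradRouting(1.0f, std::nanf(""), 2.0f, &dx, &dy);
  std::printf("dx=%f dy=%f\n", dx, dy);  // dx=2, dy=0: NaN in y sends the gradient to x
  return 0;
}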
...@@ -32,6 +32,45 @@ using dnnl::stream;
template <typename T, dnnl::algorithm BINARY_OP>
class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
private:
dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const {
dnnl::post_ops post_operations;
if (ctx.HasAttr("activation_type")) {
const float scale = ctx.HasAttr("activation_scale")
? ctx.Attr<float>("activation_scale")
: 1.0f;
const float alpha = ctx.HasAttr("activation_alpha")
? ctx.Attr<float>("activation_alpha")
: 0.0f;
const float beta = ctx.HasAttr("activation_beta")
? ctx.Attr<float>("activation_beta")
: 0.0f;
static std::unordered_map<std::string, dnnl::algorithm> algo_map = {
{"relu", dnnl::algorithm::eltwise_relu},
{"tanh", dnnl::algorithm::eltwise_tanh},
{"leaky_relu", dnnl::algorithm::eltwise_relu},
{"swish", dnnl::algorithm::eltwise_swish},
{"hardswish", dnnl::algorithm::eltwise_hardswish},
{"sqrt", dnnl::algorithm::eltwise_sqrt},
{"abs", dnnl::algorithm::eltwise_abs},
{"clip", dnnl::algorithm::eltwise_clip},
{"gelu", dnnl::algorithm::eltwise_gelu_erf},
{"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh},
{"relu6", dnnl::algorithm::eltwise_bounded_relu},
{"sigmoid", dnnl::algorithm::eltwise_logistic}};
const auto& activation_type =
algo_map.find(ctx.Attr<std::string>("activation_type"));
if (activation_type != algo_map.end()) {
post_operations.append_eltwise(scale, activation_type->second, alpha,
beta);
}
}
return post_operations;
}
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto& dev_ctx =
...@@ -47,9 +86,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
    float scale_o = ctx.Attr<float>("Scale_out");
    int axis = ctx.Attr<int>("axis");

    platform::BinaryMKLDNNHandler<T> handler(
        BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x,
        scale_y, scale_o, get_post_ops(ctx));

    const auto src_x_memory = handler.AcquireSrcMemory(x);
    const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
......
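The get_post_ops() helper above fuses an optional activation into the binary primitive through oneDNN post-ops. A hedged sketch of the same mechanism in isolation, assuming the oneDNN 2.x append_eltwise(scale, algorithm, alpha, beta) signature that this kernel already relies on; the function name is illustrative, not Paddle's:

#include "dnnl.hpp"

// Build a primitive_attr that appends a ReLU post-op, as get_post_ops() does
// for "relu"; the attr is then passed to the binary primitive descriptor.
dnnl::primitive_attr MakeBinaryAttrWithRelu(float scale, float alpha, float beta) {
  dnnl::post_ops ops;
  ops.append_eltwise(scale, dnnl::algorithm::eltwise_relu, alpha, beta);
  dnnl::primitive_attr attr;
  attr.set_post_ops(ops);
  return attr;
}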
...@@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel( ...@@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel(
if (N < ins_end) ins_end = N; if (N < ins_end) ins_end = N;
/*
if (!x1_lods_filled) {
for (int p = ins_start; p < ins_end; p++) {
x1_lods_data[p] = p;
}
if (idx == 0) {
x1_lods_data[N] = N;
}
}
if (!x2_lods_filled) {
for (int p = ins_start; p < ins_end; p++) {
x2_lods_data[p] = p;
}
if (idx == 0) {
x2_lods_data[N] = N;
}
}
if (!x1_lods_filled || !x2_lods_filled) {
b.sync();
}
*/
int flag_data[5]; int flag_data[5];
int prefix_sum_data[5]; int prefix_sum_data[5];
int prefix_sum_data2[5]; int prefix_sum_data2[5];
...@@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel( ...@@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel(
local_addr = prefix_sum_data[ins_end - 1 - ins_start]; local_addr = prefix_sum_data[ins_end - 1 - ins_start];
sum_addr = local_addr; sum_addr = local_addr;
// flag
// local_flag = 0;
for (int p = ins_start; p < ins_end; p++) { for (int p = ins_start; p < ins_end; p++) {
local_flag += flag_data[p - ins_start]; local_flag += flag_data[p - ins_start];
} }
...@@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel( ...@@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel(
sum_out_lods = local_out_lods; sum_out_lods = local_out_lods;
} }
// 32 threads
for (int i = 1; i < warp_thread_num; i *= 2) { for (int i = 1; i < warp_thread_num; i *= 2) {
int temp_addr = g.shfl_up(sum_addr, i); int temp_addr = g.shfl_up(sum_addr, i);
int temp_flag = g.shfl_up(sum_flag, i); int temp_flag = g.shfl_up(sum_flag, i);
...@@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel( ...@@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel(
if (ins_start < ins_end) { if (ins_start < ins_end) {
int out_lods_idx = p_flag + 1; int out_lods_idx = p_flag + 1;
// ins_start = 1
// BUG fix
for (int p = ins_start; p < ins_end; p++) { for (int p = ins_start; p < ins_end; p++) {
if (flag_data[p - ins_start] == 1) { if (flag_data[p - ins_start] == 1) {
// batch_len = 2
// batch_len = 4
size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p];
// t = 0
// t = 1
int t = out_lods_idx - 1; int t = out_lods_idx - 1;
// out_lods_data[0] = 0;
int previous; int previous;
if (out_lods_idx == p_flag + 1) { if (out_lods_idx == p_flag + 1) {
// out_lods_data[t] = p_out_lods;
previous = p_out_lods; previous = p_out_lods;
} else { } else {
previous = out_lods_data[t]; previous = out_lods_data[t];
} }
map_data[t * 3] = (int64_t)previous; map_data[t * 3] = (int64_t)previous;
map_data[t * 3 + 1] = x1_lods_data[p]; map_data[t * 3 + 1] = x1_lods_data[p];
map_lods_data[t] = t; map_lods_data[t] = t;
...@@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel( ...@@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel(
if (sum_out_lods4 > 1) { if (sum_out_lods4 > 1) {
int out_data_num = sum_out_lods4 - 1; int out_data_num = sum_out_lods4 - 1;
int out_start = ins_start; int out_start = ins_start;
if (out_start < out_data_num) { if (out_start < out_data_num) {
int out_end = ins_end >= out_data_num ? out_data_num : ins_end; int out_end = ins_end >= out_data_num ? out_data_num : ins_end;
for (int p = out_start; p < out_end; p++) { for (int p = out_start; p < out_end; p++) {
...@@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel( ...@@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel(
if (flag_data[p - ins_start] == 1) { if (flag_data[p - ins_start] == 1) {
auto output_start_idx = prefix_sum_data2[p - ins_start]; auto output_start_idx = prefix_sum_data2[p - ins_start];
T* dst = out_data + output_start_idx * x1_embed_size; T* dst = out_data + output_start_idx * x1_embed_size;
const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size;
const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size;
// optimized
for (const T *j = src_start; j != src_end; dst++, j++) { for (const T *j = src_start; j != src_end; dst++, j++) {
*dst = *j; *dst = *j;
} }
...@@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, ...@@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread,
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
int ins_start = idx * ins_per_thread; int ins_start = idx * ins_per_thread;
int ins_end = (idx + 1) * ins_per_thread; int ins_end = (idx + 1) * ins_per_thread;
if (ins_start >= N) { if (ins_start >= N) {
return; return;
} }
if (ins_end > N) ins_end = N; if (ins_end > N) ins_end = N;
for (int p = ins_start; p < ins_end; p++) { for (int p = ins_start; p < ins_end; p++) {
T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size;
const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size;
...@@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> { ...@@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
const Tensor* x3 = context.Input<Tensor>("Filter_tag"); const Tensor* x3 = context.Input<Tensor>("Filter_tag");
const int64_t* x3_data = x3->data<int64_t>(); const int64_t* x3_data = x3->data<int64_t>();
// int x2_lods_filled = 1;
  Vector<size_t> x2_lods;
  // Vector, in GPU
  if (x2->lod().size() != 0) {  // lod_level = 1
    x2_lods = x2->lod()[0];
    // x2_lods_filled = 1;
  } else {  // lod_level = 0
    const size_t x2_lods_size = x2->dims()[0];
    const size_t instag_per_num = x2->dims()[1];
    // x2_lods.resize(x2->dims()[0] + 1);
    // move to cuda
    x2_lods.push_back(0);
    for (size_t i = 0; i < x2_lods_size; i++) {
      x2_lods.push_back(x2_lods.back() + instag_per_num);
    }
  }
...@@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> { ...@@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place);
// Vector, in GPU
// int x1_lods_filled = 1;
Vector<size_t> x1_lods; Vector<size_t> x1_lods;
if (!is_x1_lod) { if (!is_x1_lod) {
// move to cuda
// x1_lods.resize(x1->dims()[0] + 1);
x1_lods.push_back(0); x1_lods.push_back(0);
for (int i = 0; i < x1->dims()[0]; i++) { for (int i = 0; i < x1->dims()[0]; i++) {
x1_lods.push_back(i + 1); x1_lods.push_back(i + 1);
...@@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> { ...@@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
// x1_lods = context.Input<LoDTensor>("Ins")->lod()[0]; // x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
// new: lod_level=0 => lod() return {} // new: lod_level=0 => lod() return {}
if (x1->lod().size() != 0) { // lod_level = 1 if (x1->lod().size() != 0) { // lod_level = 1
// x1_lods_filled = 1;
x1_lods = x1->lod()[0]; x1_lods = x1->lod()[0];
} else { // lod_level = 0 } else { // lod_level = 0
// x1_lods.resize(x1->dims()[0] + 1); // x1_lods.resize(x1->dims()[0] + 1);
...@@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> { ...@@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
LoDTensor* loss_weight = context.Output<LoDTensor>("LossWeight"); LoDTensor* loss_weight = context.Output<LoDTensor>("LossWeight");
int out_first = x1_lods.back(); int out_first = x1_lods.back();
// int out_first = x1->dims()[0];
// if (x1_lods_filled) {
// out_first = x1_lods.back();
// }
out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size}));
map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3}));
......
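The change above builds x2_lods as cumulative offsets when the tag tensor has no LoD: each of the x2_lods_size rows contributes instag_per_num (= x2->dims()[1]) tags, so the offsets become {0, k, 2k, ...} instead of {0, 1, 2, ...}. A small host-side sketch of the same construction, using a plain std::vector and an illustrative name:

#include <cstddef>
#include <vector>

// Offsets for `rows` rows of `instag_per_num` tags each: {0, k, 2k, ..., rows*k}.
std::vector<size_t> BuildTagOffsets(size_t rows, size_t instag_per_num) {
  std::vector<size_t> lods;
  lods.push_back(0);
  for (size_t i = 0; i < rows; ++i) {
    lods.push_back(lods.back() + instag_per_num);
  }
  return lods;  // e.g. rows=3, k=2 -> {0, 2, 4, 6}
}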
...@@ -16,7 +16,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/flatten_grad_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h"
......
...@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/ternary.h"
namespace paddle {
namespace operators {
......
...@@ -17,7 +17,6 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/kron_op.h"
#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
......
...@@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext *ctx) const override {
    // check input
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad");
    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad");
    OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance",
                   "GroupNormGrad");
    OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad");
    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
                   framework::GradVarName("Y"), "GroupNormGrad");
...@@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker<T> {
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("group_norm_grad");
    op->SetInput("X", this->Input("X"));
    op->SetInput("Scale", this->Input("Scale"));
    op->SetInput("Bias", this->Input("Bias"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
    op->SetInput("Y", this->Output("Y"));
    op->SetInput("Mean", this->Output("Mean"));
    op->SetInput("Variance", this->Output("Variance"));
    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
......
...@@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W,
  CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
}

template <typename T, typename AccT, int VecSize, int Num>
__device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
                                             int size, const int offset,
                                             AccT* out_mean, AccT* out_var) {
  const T* x = arrs[0];
  const T* y;
  if (Num == 2) {
    y = arrs[1];
  }
  using VecT = kps::details::VectorType<T, VecSize>;
  int tid = threadIdx.x;
  if (offset > 0) {
    x -= offset;
    if (Num == 2) {
      y -= offset;
    }
    size += offset;
    if (tid >= offset) {
      if (Num == 1) {
        *out_mean += x[tid];
        *out_var += x[tid] * x[tid];
      } else if (Num == 2) {
        *out_mean += y[tid];
        *out_var += y[tid] * x[tid];
      }
    }
    size -= blockDim.x;
    x += blockDim.x;
    if (Num == 2) {
      y += blockDim.x;
    }
  }
  int remain = size % (VecSize * blockDim.x);

  T ins_x[VecSize];
  T ins_y[VecSize];
  VecT* ins_vec_x = reinterpret_cast<VecT*>(&ins_x);
  VecT* ins_vec_y = reinterpret_cast<VecT*>(&ins_y);

  // vector part
  for (; VecSize * tid < (size - remain); tid += blockDim.x) {
    *ins_vec_x = reinterpret_cast<const VecT*>(x)[tid];
    if (Num == 2) {
      *ins_vec_y = reinterpret_cast<const VecT*>(y)[tid];
    }

#pragma unroll
    for (int i = 0; i < VecSize; ++i) {
      if (Num == 1) {
        *out_mean += ins_x[i];
        *out_var += ins_x[i] * ins_x[i];
      } else if (Num == 2) {
        *out_mean += ins_y[i];
        *out_var += ins_y[i] * ins_x[i];
      }
    }
  }

  // scalar part
  tid = size - remain + threadIdx.x;
  for (; tid < size; tid += blockDim.x) {
    if (Num == 1) {
      *out_mean += x[tid];
      *out_var += x[tid] * x[tid];
    } else if (Num == 2) {
      *out_mean += y[tid];
      *out_var += y[tid] * x[tid];
    }
  }
}

...@@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
  AccT x_var = static_cast<AccT>(0);
  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
  x += i * size;
  phi::Array<const T*, 1> ins;
  ins[0] = x;
  ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
  x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
      x_mean, kps::AddFunctor<AccT>());
  x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
...@@ -310,10 +341,12 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
};

template <typename T, int flags>
__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale,
                                               const T* bias, const T* d_y,
                                               int N, int C, int W, int imsize,
                                               int groups, int group_size,
                                               T epsilon, T* d_mean, T* d_var,
                                               T* d_scale, T* d_bias) {
  int gid = blockIdx.y;
  int cid = blockIdx.x;
  int bid = blockIdx.z;
...@@ -329,15 +362,11 @@ __global__ void GroupNormBackwardGetMeanAndVar(
  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
    T val, dval;

    int hid = imid / W;
    int wid = imid % W;
    val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias;
    dval = d_y[(bid * H + hid) * W * C + wid * C + ccid];

    d_var_data += val * dval;
    d_mean_data += dval * x_scale;
...@@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
                                  const T* bias, const T* var, const T* d_mean,
                                  const T* d_var, int N, int C, int W,
                                  int imsize, int groups, int group_size,
                                  T epsilon, T* d_x) {
  int gid = blockIdx.y;
  int cid = blockIdx.x;
  int bid = blockIdx.z;
...@@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
  if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
    int hid = imid / W;
    int wid = imid % W;
    T tmp = x[(bid * H + hid) * W * C + wid * C + ccid];
    T v_y = (tmp - x_bias) * x_scale_inv;
    T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid];
    d_x[(bid * H + hid) * W * C + wid * C + ccid] =
        x_var_inv *
        (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
  }
}

template <typename T, typename AccT, int VecSize>
__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
                                            T* ds, T* db) {
  int i = blockIdx.x;
  AccT ds_sum = static_cast<AccT>(0);
AccT db_sum = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * imsize;
phi::Array<const T*, 2> ins;
ins[0] = x;
ins[1] = dy;
ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
&ds_sum);
ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
ds_sum, kps::AddFunctor<AccT>());
db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
db_sum, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
ds[i] = ds_sum;
db[i] = db_sum;
}
}
template <typename T>
__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
T* ds, T* db) {
const int nc = blockIdx.x;
T ds_sum = 0;
T db_sum = 0;
for (int i = threadIdx.x; i < imsize; i += blockDim.x) {
const int index = nc * imsize + i;
ds_sum += dy[index] * x[index];
db_sum += dy[index];
}
CudaAtomicAddWithWarp(&ds[nc], ds_sum);
CudaAtomicAddWithWarp(&db[nc], db_sum);
}
template <typename T>
__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group,
T epsilon, const T* mean,
const T* var, const T* ds,
const T* db, T* d_scale,
T* d_bias) {
const int c = blockIdx.x * blockDim.x + threadIdx.x;
if (c < C) {
const int G = group;
const int D = C / G;
T sum1 = 0;
T sum2 = 0;
for (int n = 0; n < N; ++n) {
const int nc = n * C + c;
const int ng = n * G + c / D;
sum1 += (d_scale == nullptr)
? T(0)
: ((ds[nc] - db[nc] * static_cast<T>(mean[ng])) *
static_cast<T>(rsqrt(var[ng] + epsilon)));
sum2 += (d_bias == nullptr) ? T(0) : db[nc];
}
if (d_scale != nullptr) {
d_scale[c] = sum1;
}
if (d_bias != nullptr) {
d_bias[c] = sum2;
    }
  }
}
template <typename T, int BlockDim>
__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups,
int group_size, T epsilon,
const T* mean, const T* var,
const T* scale, const T* ds,
const T* db, T* p1, T* p2, T* p3) {
const int n = blockIdx.x;
const int g = blockIdx.y;
const int ng = n * groups + g;
T sum1 = 0;
T sum2 = 0;
T var_inv = rsqrt(var[ng] + epsilon);
for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) {
const int64_t index = ng * group_size + i;
const int64_t c = g * group_size + i;
const T scale_v = scale == nullptr ? T(1) : static_cast<T>(scale[c]);
sum1 += ds[index] * scale_v;
sum2 += db[index] * scale_v;
const T scale_c = scale == nullptr ? T(0) : static_cast<T>(scale[c]);
p1[index] = scale_c * var_inv;
}
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum());
sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum());
if (threadIdx.x == 0) {
const T s = T(1) / static_cast<T>(group_size * imsize);
const T x = (sum2 * static_cast<T>(mean[ng]) - sum1) *
static_cast<T>(var_inv) * static_cast<T>(var_inv) *
static_cast<T>(var_inv) * s;
p2[ng] = x;
p3[ng] = -x * static_cast<T>(mean[ng]) - sum2 * static_cast<T>(var_inv) * s;
}
}
template <typename T>
__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size,
int groups, T* p1, T* p2, T* p3,
const T* x, const T* dy, T* dx) {
int cid = blockIdx.x;
int gid = blockIdx.y;
int bid = blockIdx.z;
int ccid = bid * C + gid * group_size + cid;
int ng = bid * groups + gid;
int nc = gid * group_size + cid;
for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
int index = (bid * C + nc) * imsize + imid;
dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng];
}
}
template <typename T>
class GroupNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
...@@ -408,7 +552,9 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
    const DataLayout data_layout =
        framework::StringToDataLayout(data_layout_str);
    const float epsilon = ctx.Attr<float>("epsilon");
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* var = ctx.Input<Tensor>("Variance");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
...@@ -433,31 +579,27 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    Tensor ds, db;
    ds.mutable_data<T>({x_dims[0], C}, ctx.GetPlace());
    db.mutable_data<T>({x_dims[0], C}, ctx.GetPlace());
    T* ds_data = ds.data<T>();
    T* db_data = db.data<T>();

    auto* y_data = y->data<T>();
    auto* x_data = x->data<T>();
    T* d_x_data = nullptr;
    if (d_x) d_x_data = d_x->data<T>();
    auto* dy_data = d_y->data<T>();
    auto* var_data = var->data<T>();
    auto* mean_data = mean->data<T>();
    T* d_scale_data = nullptr;
    if (d_scale) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_scale_data = d_scale->data<T>();
    }
    T* d_bias_data = nullptr;
    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());
      d_bias_data = d_bias->data<T>();
    }
...@@ -479,22 +621,103 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
#ifdef __HIPCC__
    int block_size = std::max(std::min(256, imsize), 64);
    const int block_dims = 256;
#else
    int block_size = std::min(1024, imsize);
    const int block_dims = 1024;
#endif
    dim3 grid(group_size, groups, x_dims[0]);
    dim3 threads(block_size, 1, 1);
    int flags =
        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
    if (data_layout == DataLayout::kNCHW) {
      using AccT = typename details::MPTypeTrait<T>::Type;
      constexpr int vec_size = sizeof(float4) / sizeof(T);
      const int max_num_threads = 1024;
      int max_block_size = std::min(imsize / vec_size, max_num_threads);
      int block_size_nchw = 1;
      while (block_size_nchw < max_block_size) {
        block_size_nchw *= 2;
      }
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 blocks(block_size_nchw);
if (imsize < vec_size) {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
}
if (d_bias) {
set_zero(dev_ctx, d_bias, static_cast<T>(0));
}
ScalarGetDsDbCUDAKernel<
T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
imsize, x_data, dy_data, ds_data, db_data);
} else {
VectorizedGetDsDbCUDAKernel<
T, AccT, vec_size><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
imsize, x_data, dy_data, ds_data, db_data);
}
if (d_scale || d_bias) {
const int block = 256;
GetScaleBiasGradientCUDAKernel<
T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>(
x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data,
db_data, d_scale_data, d_bias_data);
}
if (d_x_data != nullptr) {
// p1 * dy + p2 * x + p3,
// p1, p2, p3 represent the reverse calculation of temporary variables
// p1 = scale * var_inv
// p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n)
// p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n);
Tensor p1, p2, p3;
p1.mutable_data<T>({x_dims[0] * C}, ctx.GetPlace());
p2.mutable_data<T>({x_dims[0], groups}, ctx.GetPlace());
p3.mutable_data<T>({x_dims[0], groups}, ctx.GetPlace());
T* p1_data = p1.data<T>();
T* p2_data = p2.data<T>();
T* p3_data = p3.data<T>();
GetBackwardParamsCUDAKernel<T, block_dims><<<
dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>(
imsize, groups, group_size, epsilon, mean_data, var_data,
scale_data, ds_data, db_data, p1_data, p2_data, p3_data);
GetXGradientCUDAKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
dy_data, d_x_data);
}
} else {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
}
if (d_bias) {
set_zero(dev_ctx, d_bias, static_cast<T>(0));
}
Tensor temp_var;
temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
T* temp_var_data = temp_var.data<T>();
Tensor temp_mean;
temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
T* temp_mean_data = temp_mean.data<T>();
int flags = (scale_data != nullptr) * kHasScale +
(bias_data != nullptr) * kHasBias;
UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data,
scale_data, bias_data, dy_data, x_dims[0], C, W, imsize,
groups, group_size, epsilon, temp_mean_data,
temp_var_data, d_scale_data, d_bias_data);
if (d_x_data != nullptr) {
UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data,
bias_data, var_data, temp_mean_data, temp_var_data,
x_dims[0], C, W, imsize, groups, group_size, epsilon,
d_x_data);
}
    }
  }
};
......
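For the NCHW fast path above, the per-element input gradient is assembled from three precomputed coefficients, dx = p1 * dy + p2 * x + p3, where ds = sum(dy * x) and db = sum(dy) per (n, c). A scalar restatement of the formulas in GetBackwardParamsCUDAKernel's comments; a hedged sketch with illustrative names (assuming a scale tensor is present), not Paddle code:

#include <cmath>

// sum_ds_scale = sum(ds * scale), sum_db_scale = sum(db * scale) over the
// channels of one (n, g) group; s = 1 / (group_size * imsize).
struct GroupNormBackwardParams {
  float p2;
  float p3;
};

GroupNormBackwardParams ComputeP2P3(float sum_ds_scale, float sum_db_scale,
                                    float mean, float var, float epsilon,
                                    float s) {
  float var_inv = 1.0f / std::sqrt(var + epsilon);
  float p2 = (sum_db_scale * mean - sum_ds_scale) * var_inv * var_inv * var_inv * s;
  float p3 = -p2 * mean - sum_db_scale * var_inv * s;
  return {p2, p3};
}

// Per element: dx = p1 * dy + p2 * x + p3, with p1 = scale * var_inv.
float GroupNormDx(float scale, float var, float epsilon, float p2, float p3,
                  float x, float dy) {
  float p1 = scale / std::sqrt(var + epsilon);
  return p1 * dy + p2 * x + p3;
}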
...@@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
template <typename T>
__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex(
    int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x,
    const int in_img_x) {
  src_x = (src_x > 0) ? src_x : 0.f;
  *in_img_idx = static_cast<int>(src_x);
  *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0;
  *lambda1 = src_x - *in_img_idx;
  *lambda2 = 1.f - *lambda1;
}

struct FastDivModForInterpolate {
...@@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory(
  }
}
__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height,
const int width, const int h,
const int w) {
return (nc * height + h) * width + w;
}
template <typename T>
__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w,
const int out_h, const int out_w,
const int n, const int num_channels,
float ratio_h, float ratio_w,
const T* __restrict__ out,
const T align_type_value) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
int num_out = n * num_channels * out_h * out_w;
int num_in = n * num_channels * in_h * in_w;
for (; index < num_out; index += stride) {
int index_tmp = index;
int w2 = index_tmp % out_w;
index_tmp /= out_w;
int h2 = index_tmp % out_h;
int nc = index_tmp / out_h;
int h1, y_id;
T h1lambda, h0lambda;
T src_y = ratio_h * (h2 + align_type_value) - align_type_value;
PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda,
src_y, in_h);
int w1, x_id;
T w1lambda, w0lambda;
T src_x = ratio_w * (w2 + align_type_value) - align_type_value;
PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda,
src_x, in_w);
T d2val = out[index];
platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1),
h0lambda * w0lambda * d2val);
platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id),
h0lambda * w1lambda * d2val);
platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1),
h1lambda * w0lambda * d2val);
platform::CudaAtomicAdd(
in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id),
h1lambda * w1lambda * d2val);
}
}
template <typename T>
__global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w,
                                   const T* __restrict__ out, const int out_h,
                                   const int out_w, const int n,
                                   const int out_chw, const int num_channels,
                                   float ratio_h, float ratio_w,
                                   const T align_type_value,
                                   FastDivModForInterpolate divmods) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int in_chw = in_h * in_w * num_channels;
  int nthreads = n * out_chw;

  for (; tid < nthreads; tid += stride) {
    auto out_id_divmod = divmods.output_w_div.Divmod(tid);
    int out_id_h = out_id_divmod.val[0];
    int out_id_w = out_id_divmod.val[1];

    int channel_id = divmods.channels_div.Divmod(tid).val[1];
    auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w);
    int out_img_idy = outimg_id_divmod.val[0];
    int out_img_idx =
        divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0];

    int in_img_idx, in_img_idy, w_id, h_id;
    T w1lambda, h1lambda, w2lambda, h2lambda;
    T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
    T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;

    PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda,
                                           &w2lambda, src_w, in_w);
    PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda,
                                           &h2lambda, src_h, in_h);

    T value = out[tid];
    T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels +
                    in_img_idx * num_channels + channel_id];
    platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value);
    platform::CudaAtomicAdd(&in_pos[w_id * num_channels],
                            h2lambda * w1lambda * value);
    platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels],
                            h1lambda * w2lambda * value);
    platform::CudaAtomicAdd(
        &in_pos[h_id * in_w * num_channels + w_id * num_channels],
        h1lambda * w1lambda * value);
  }
}
...@@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
                                 ctx.cuda_device_context().stream()>>>(
          input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c,
          ratio_h, ratio_w, align_type_value, is_nchw);
} else if (!optimize_flag & is_nchw) {
//
const int num_kernels = n * c * out_h * out_w;
const int num_threads =
std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024);
KeBilinearInterpNCHWBw<
T><<<platform::DivUp(num_kernels, num_threads), num_threads, 0,
ctx.cuda_device_context().stream()>>>(
input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w,
output_grad_data, align_type_value);
    } else {
      int64_t cw = c * out_w;
      auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw);
      KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
                              ctx.cuda_device_context().stream()>>>(
          input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n,
          out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods);
    }
  } else if ("bicubic" == interp_method) {
#ifdef __HIPCC__
......
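In the NHWC branch above, FastDivModForInterpolate replaces the plain division/modulo chain for decomposing a flat output index into (batch, out_y, out_x, channel). A hedged sketch of the equivalent arithmetic in ordinary integers (illustrative names, not Paddle code):

#include <cstdio>

// Decompose a flat NHWC output index `tid`, with out_chw = out_h * out_w * c,
// mirroring what the Divmod calls compute.
void DecomposeNHWC(int tid, int out_w, int out_h, int c,
                   int* n, int* out_y, int* out_x, int* channel) {
  int out_chw = out_h * out_w * c;
  int out_id_w = tid % out_chw;  // position inside one sample
  *n = tid / out_chw;
  *channel = tid % c;
  *out_y = out_id_w / (out_w * c);
  *out_x = (out_id_w % (out_w * c)) / c;
}

int main() {
  int n, y, x, ch;
  DecomposeNHWC(/*tid=*/13, /*out_w=*/4, /*out_h=*/2, /*c=*/3, &n, &y, &x, &ch);
  std::printf("n=%d y=%d x=%d c=%d\n", n, y, x, ch);  // n=0 y=1 x=0 c=1
  return 0;
}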
...@@ -17,9 +17,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
...@@ -178,27 +176,4 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker,
                  ops::KronGradOpMaker<paddle::framework::OpDesc>,
                  ops::KronGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
kron, ops::KronKernel<paddle::platform::CPUDeviceContext, float>,
ops::KronKernel<paddle::platform::CPUDeviceContext, double>,
ops::KronKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::KronKernel<paddle::platform::CPUDeviceContext, int>,
ops::KronKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::KronKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::KronKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OPERATOR(kron_grad, ops::KronGradOp);
REGISTER_OP_CPU_KERNEL(
kron_grad, ops::KronGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::float16>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::KronGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kron_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
kron, ops::KronKernel<paddle::platform::CUDADeviceContext, float>,
ops::KronKernel<paddle::platform::CUDADeviceContext, double>,
ops::KronKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::KronKernel<paddle::platform::CUDADeviceContext, int>,
ops::KronKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::KronKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::KronKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
kron_grad, ops::KronGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::KronGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -17,9 +17,11 @@
#include <string>
#include <vector>

#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/operators/lstsq_op.h"
#include "paddle/fluid/operators/qr_op.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
#include "paddle/phi/kernels/triangular_solve_kernel.h"

namespace paddle {
namespace operators {
...@@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
    Tensor tau = dito.Fill(tau_dims_vec, 0);
    auto tau_data = tau.mutable_data<T>(context.GetPlace());

    using Context =
        typename framework::ConvertToPhiContext<DeviceContext>::TYPE;
    auto& phi_dev_ctx = static_cast<const Context&>(dev_ctx);

    if (m >= n) {
      Tensor tmp_x = dito.Transpose(new_x);
      Tensor tmp_y = dito.Transpose(new_y);
...@@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
      Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn});

      // Step 3, solve R X = Y
      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, res_r, slice_y, true,
                                             false, false, solution);
    } else {
      auto x_data = new_x.mutable_data<T>(context.GetPlace());
      auto y_data = new_y.mutable_data<T>(context.GetPlace());
...@@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel<T> {
      // Step 2, solve R^H Z = Y
      Tensor trans_r = dito.Transpose(new_x);
      phi::TriangularSolveKernel<T, Context>(phi_dev_ctx, trans_r, new_y, true,
                                             true, false, solution);

      // Step 3, X <- Q Z
      BatchedOrgqr<DeviceContext, T>(dev_ctx, batch_count, n, n, min_mn, x_data,
......
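The two TriangularSolveKernel calls above carry out the standard QR-based least-squares steps that the kernel comments describe; only as a compact restatement of that math, not an exact transcription of the code:

\min_x \|Ax - b\|_2:\quad
\text{if } m \ge n:\ A = QR,\ \ R x = Q^{\top} b;\qquad
\text{if } m < n:\ A^{H} = QR,\ \ R^{H} z = b,\ \ x = Q z.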
...@@ -22,7 +22,6 @@
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
......
(This diff is collapsed.)
...@@ -20,7 +20,6 @@ math_library(sampler DEPS generator)
# math_library(math_function DEPS blas dense_tensor tensor)
math_library(maxouting)
math_library(pooling)
if(WITH_MKLDNN)
    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
......
...@@ -117,14 +117,6 @@ class MatrixSolveFunctor {
                  const framework::Tensor& b, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class TriangularSolveFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor* a,
framework::Tensor* b, bool left, bool upper, bool transpose,
bool unitriangular);
};
}  // namespace math
}  // namespace operators
}  // namespace paddle
(This diff is collapsed.)
...@@ -24,6 +24,7 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/kernel_registry.h"

USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
...@@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);

PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT);

namespace paddle {
namespace operators {
......
(The remaining file diffs in this commit are collapsed in this view.)