Unverified commit 344b99e1, authored by huangjiyi, committed by GitHub

[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)

Parent 4672ea8e
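The rewrite below is mechanical at nearly every call site: the CPU ISA helpers that fluid exposed through "paddle/fluid/platform/cpu_info.h" under `paddle::platform` are now consumed from "paddle/phi/backends/cpu/cpu_info.h" under `phi::backends::cpu`, with identical signatures. A minimal sketch of the rename pattern (the wrapper function is illustrative, not part of the commit):

#include "paddle/phi/backends/cpu/cpu_info.h"

// Old spelling, removed by this commit:
//   paddle::platform::MayIUse(paddle::platform::cpu_isa_t::avx512_core)
// New spelling, the same runtime CPUID probe:
bool SupportsAvx512Core() {
  return phi::backends::cpu::MayIUse(
      phi::backends::cpu::cpu_isa_t::avx512_core);
}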
@@ -19,9 +19,9 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/table_printer.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/utils/string/split.h"

 #ifdef PADDLE_WITH_TENSORRT
@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
 void AnalysisConfig::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) {
     use_mkldnn_bfloat16_ = true;
     LOG(INFO) << "Hardware support for BFLOAT16"
-              << (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
+              << (phi::backends::cpu::MayIUse(
+                      phi::backends::cpu::cpu_isa_t::avx512_bf16)
                       ? " is enabled"
                       : " is disabled. Simulation will be used");
   } else {
...
@@ -29,7 +29,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_string(dirname, "", "dirname to tests.");
@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   config.EnableUseGpu(100, 0);
   config.EnableMkldnnBfloat16();
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
   else
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
...
@@ -27,7 +27,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_string(dirname, "", "dirname to tests.");
...
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN");
@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInputs(&input_slots_all);
   if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 &&
-      platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) {
+      phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16)) {
     b_cfg.EnableMkldnnBfloat16();
   } else {
     FLAGS_enable_bf16 = false;
...
@@ -27,9 +27,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/allocation/system_allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace memory {
...
@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(),
-        platform::CpuMaxChunkSize());
+        phi::backends::cpu::CpuMinChunkSize(),
+        phi::backends::cpu::CpuMaxChunkSize());
   });
   return a;
@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::NPUPinnedAllocator),
-                            platform::NPUPinnedMinChunkSize(),
-                            platform::NPUPinnedMaxChunkSize());
+                            phi::backends::cpu::NPUPinnedMinChunkSize(),
+                            phi::backends::cpu::NPUPinnedMaxChunkSize());
   });
   return ba;
@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
-                            platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize());
+                            phi::backends::cpu::CUDAPinnedMinChunkSize(),
+                            phi::backends::cpu::CUDAPinnedMaxChunkSize());
   });
   return ba;
...
@@ -28,10 +28,10 @@ limitations under the License. */
 #endif
 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #ifdef PADDLE_WITH_MLU
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
 #endif
@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
   size_t usable =
-      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+      phi::backends::cpu::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;
   size_t usable =
-      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
+      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
...
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 // This unit test is an example comparing the performance between using pinned
 // memory and not. In general, using pinned memory will be faster.
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
-    phi::funcs::vec_relu<T, platform::avx>(n, y, y);
+    phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(n, *bias, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, y, y);
   } else {
-    phi::funcs::vec_relu<T, platform::avx>(n, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, x, y);
   }
 }
@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-  phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
-  phi::funcs::vec_exp<T>(n, y, y);                               // exp
+  phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(
+      n, -scalar, x, y);            // sub
+  phi::funcs::vec_exp<T>(n, y, y);  // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
...
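The vec_softmax hunk above keeps the standard numerically stable softmax: find the row max, subtract it, exponentiate, then sum and rescale. A scalar sketch of the same computation, with explicit loops in place of the vectorized phi::funcs helpers (illustrative only):

#include <algorithm>
#include <cmath>

template <typename T>
void naive_softmax(const int n, const T* x, T* y) {
  // Shifting by the max makes the largest exponent exp(0) == 1,
  // so std::exp cannot overflow for large logits.
  T max_val = *std::max_element(x, x + n);
  T sum = T(0);
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i] - max_val);  // sub + exp
    sum += y[i];
  }
  for (int i = 0; i < n; ++i) {
    y[i] /= sum;  // scale
  }
}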
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
+
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"

 namespace paddle {
 namespace operators {
...
@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});

-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* logits_max_buff = logits_max.mutable_data<T>(place);

-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());

     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);

-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);
@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
         comm->comm(),
         stream));

-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);

     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});

-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);

-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());

     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);

-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);
@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();

-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);

     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
...
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
...
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/elementwise_kernel.h"

 namespace paddle {
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/sequence2batch.h"
@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   auto& act_gate_str = ctx.Attr<std::string>("gate_activation");            \
   auto& act_cell_str = ctx.Attr<std::string>("cell_activation");            \
   auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");       \
-  if (platform::MayIUse(platform::avx)) {                                   \
-    phi::funcs::VecActivations<T, platform::avx> act_functor;               \
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {               \
+    phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;     \
     act_gate = act_functor(act_gate_str);                                   \
     act_cell = act_functor(act_cell_str);                                   \
     act_cand = act_functor(act_cand_str);                                   \
   } else {                                                                  \
-    phi::funcs::VecActivations<T, platform::isa_any> act_functor;           \
+    phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor; \
     act_gate = act_functor(act_gate_str);                                   \
     act_cell = act_functor(act_cell_str);                                   \
     act_cand = act_functor(act_cand_str);                                   \
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     std::function<void(const int, const T*, T*)> fc_act;
     auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       fc_act = act_functor(fc_act_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       fc_act = act_functor(fc_act_str);
     }
...
@@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
 file(APPEND ${jit_file}
      "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
+set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
 file(
   GLOB jit_kernel_cc_srcs
...
@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/act.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -98,27 +98,27 @@ DECLARE_ACT_CREATOR(VTanh);
 // TODO(TJ): tuning use me
 bool VReluCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VSquareCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VIdentityCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VExpCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d < 32;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 32;
 }

 bool VSigmoidCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VTanhCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 size_t VReluCreator::CodeSize(const int& d) const {
...
@@ -84,8 +84,8 @@ class VActFunc : public JitCode {
   // compute EXP with ymm, xmm
   template <typename JMM>
-  void exp_jmm(JMM& dst,
-               JMM& src,
+  void exp_jmm(JMM& dst,  // NOLINT
+               JMM& src,  // NOLINT
                int src_idx = 11,
-               int fx_idx = 12,  // NOLINT
+               int fx_idx = 12,
                int fy_idx = 13,
@@ -144,10 +144,11 @@ class VActFunc : public JitCode {
     vcvttps2dq(ymm_int, jmm_fx);
     mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
     vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
-    if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx2) ||
+        std::is_same<JMM, xmm_t>::value) {
       vpaddd(ymm_int, ymm_int, jmm_tmp);
       vpslld(ymm_int, ymm_int, 23);
-    } else if (MayIUse(avx)) {
+    } else if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
       xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
       xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
       reg64_t reg_ptr_tmp = reg_ptr_global;
@@ -174,8 +175,8 @@ class VActFunc : public JitCode {
   // compute SIGMOID with ymm, xmm
   template <typename JMM>
-  void sigmoid_jmm(JMM& dst,
-                   JMM& src,
-                   int src_idx = 11,  // NOLINT
+  void sigmoid_jmm(JMM& dst,  // NOLINT
+                   JMM& src,  // NOLINT
+                   int src_idx = 11,
                    int fx_idx = 12,
                    int fy_idx = 13,
@@ -203,8 +204,8 @@ class VActFunc : public JitCode {
   // compute TANH with ymm, xmm
   template <typename JMM>
-  void tanh_jmm(JMM& dst,
-                JMM& src,
-                int src_idx = 11,  // NOLINT
+  void tanh_jmm(JMM& dst,  // NOLINT
+                JMM& src,  // NOLINT
+                int src_idx = 11,
                 int fx_idx = 12,
                 int fy_idx = 13,
...
@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -132,7 +132,7 @@ void AdamJitCode::genCode() {
 class AdamCreator : public JitCodeCreator<adam_attr_t> {
  public:
   bool CanBeUsed(const adam_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const adam_attr_t& attr) const override {
     return 96 + 32 * 8;
...
@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -147,7 +147,7 @@ void AdamWJitCode::genCode() {
 class AdamWCreator : public JitCodeCreator<int> {
  public:
   bool CanBeUsed(const int& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; }
   std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
...
@@ -16,7 +16,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -145,7 +145,7 @@ void NCHW16CMulNCJitCode::genCode() {
 class NCHW16CMulNCCreator : public JitCodeCreator<int> {
  public:
   bool CanBeUsed(const int& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const int& d) const override { return 256 * 1024; }
   std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
@@ -157,7 +157,8 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
   class name##Creator : public JitCodeCreator<int> {                  \
    public:                                                            \
     bool CanBeUsed(const int& attr) const override {                  \
-      return platform::MayIUse(platform::avx) && attr <= 1024;        \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&  \
+             attr <= 1024;                                            \
     }                                                                 \
     size_t CodeSize(const int& d) const override {                    \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                        \
...
@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -123,7 +123,7 @@ void EmbSeqPoolJitCode::genCode() {
 class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
  public:
   bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx) &&
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
            attr.table_width % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
...
@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -85,20 +85,21 @@ void GRUJitCode::genCode() {
   ret();
 }

 #define DECLARE_GRU_CREATOR(name)                                    \
   class name##Creator : public JitCodeCreator<gru_attr_t> {          \
    public:                                                           \
     /* TODO(TJ): enable more */                                      \
     bool CanBeUsed(const gru_attr_t& attr) const override {          \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0;    \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
+             attr.d % 8 == 0;                                        \
     }                                                                \
     size_t CodeSize(const gru_attr_t& attr) const override {         \
       return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;             \
     }                                                                \
     std::unique_ptr<GenBase> CreateJitCode(                          \
         const gru_attr_t& attr) const override {                     \
       return make_unique<name##JitCode>(attr, CodeSize(attr));       \
     }                                                                \
   }

 DECLARE_GRU_CREATOR(GRUH1);
...
@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/hopv.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -78,7 +78,7 @@ void HOPVJitCode::genCode() {
   class name##Creator : public JitCodeCreator<int> {                  \
    public:                                                            \
     bool CanBeUsed(const int& attr) const override {                  \
-      return platform::MayIUse(platform::avx);                        \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);    \
     }                                                                 \
     size_t CodeSize(const int& d) const override {                    \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                        \
...
@@ -18,7 +18,7 @@
 #include <type_traits>

 #include "paddle/fluid/operators/jit/gen_base.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
@@ -92,7 +92,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
     for (int i = 0; i < num_g_abi_regs; ++i) {
       push(Xbyak::Reg64(g_abi_regs[i]));
     }
-    if (platform::MayIUse(platform::avx512f)) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
       mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
     }
   }
...
@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -113,20 +113,21 @@ void LSTMJitCode::genCode() {
   }
 }

 #define DECLARE_LSTM_CREATOR(name)                                   \
   class name##Creator : public JitCodeCreator<lstm_attr_t> {         \
    public:                                                           \
     /* TODO(TJ): enable more */                                      \
     bool CanBeUsed(const lstm_attr_t& attr) const override {         \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0;    \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
+             attr.d % 8 == 0;                                        \
     }                                                                \
     size_t CodeSize(const lstm_attr_t& attr) const override {        \
       return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;             \
     }                                                                \
     std::unique_ptr<GenBase> CreateJitCode(                          \
         const lstm_attr_t& attr) const override {                    \
       return make_unique<name##JitCode>(attr, CodeSize(attr));       \
     }                                                                \
   }

 DECLARE_LSTM_CREATOR(LSTMCtHt);
...
@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -110,12 +110,13 @@ void MatMulJitCode::genCode() {
 class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
  public:
   bool CanBeUsed(const matmul_attr_t& attr) const override {
-    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
+    return attr.m == 1 &&
+           phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) &&
            attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
   }
   size_t CodeSize(const matmul_attr_t& attr) const override {
     int block = YMM_FLOAT_BLOCK;
-    if (platform::MayIUse(platform::avx512f)) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
       block = ZMM_FLOAT_BLOCK;
     }
     return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
...
@@ -16,7 +16,7 @@
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -59,7 +59,7 @@ void SeqPoolJitCode::genCode() {
 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
  public:
   bool CanBeUsed(const seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
   }
   size_t CodeSize(const seq_pool_attr_t& attr) const override {
     return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
...
@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
 class SgdCreator : public JitCodeCreator<sgd_attr_t> {
  public:
   bool CanBeUsed(const sgd_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx) &&
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
            attr.grad_width % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
...
@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/vbroadcast.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -69,7 +69,8 @@ void VBroadcastJitCode::genCode() {
 class VBroadcastCreator : public JitCodeCreator<int64_t> {
  public:
   bool CanBeUsed(const int64_t& w) const override {
-    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
+           w % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const int64_t& w) const override {
     return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
...
@@ -17,8 +17,8 @@
 #include <fstream>

 #include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #ifndef _WIN32
 #define posix_memalign_free free
@@ -66,7 +66,7 @@ void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
 std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
   int block;
   int max_num_regs;
-  if (platform::MayIUse(platform::avx512f)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     block = ZMM_FLOAT_BLOCK;
     max_num_regs = 32;
   } else {
...
@@ -17,7 +17,7 @@
 #include <limits>

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -172,7 +172,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
 #else
   constexpr int block = YMM_FLOAT_BLOCK;
 #endif
-  return platform::MayIUse(platform::avx) && d >= block;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d >= block;
 }

 }  // namespace intrinsic
...
@@ -17,7 +17,7 @@
 #include <limits>

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -179,7 +179,8 @@ void LayerNorm(float* x,
 }

 bool LayerNormKernel::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
+         d >= YMM_FLOAT_BLOCK;
 }

 }  // namespace intrinsic
...
@@ -16,8 +16,8 @@
 #include "paddle/fluid/operators/jit/refer/refer.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -188,17 +188,17 @@ void StrideASum<double>(const double* x, double* res, int n, int stride) {
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool VMulKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
 }

 template <>
 bool VAddKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d > 512;
 }

 template <>
 bool VScalKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
 }

 template <>
@@ -274,7 +274,7 @@ bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
 template <>
 bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 template <>
@@ -285,7 +285,7 @@ bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
 template <>
 bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
   // tuned on avx2
-  return platform::MayIUse(platform::avx) && d < 60;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 60;
 }

 #define AWALYS_USE_ME_WITH_DOUBLE(func) \
...
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
@@ -437,7 +437,7 @@ void TestKernelNCHW16CMulNC() {
   EXPECT_TRUE(tgt != nullptr);

   if (std::is_same<T, float>::value &&
-      paddle::platform::MayIUse(paddle::platform::avx512f)) {
+      phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     EXPECT_TRUE(jitcode != nullptr);
   }
   for (int ni = 0; ni < n; ni++) {
@@ -1393,7 +1393,7 @@ TEST(JITKernel_helper, pack_weights) {
   }

   int block = 0;
   std::vector<int> groups;
-  if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     block = ZMM_FLOAT_BLOCK;
     groups.push_back(30);
   } else {
...
@@ -32,7 +32,6 @@ math_library(maxouting)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
-math_library(softmax DEPS math_function jit_kernel_helper)

 if(WITH_ASCEND_CL)
   math_library(beam_search DEPS math_function beam_search_npu)
 elseif(WITH_XPU)
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
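// Note (added for this review): the *_BLOCK constants above record how many
// lanes one SIMD register holds: a 256-bit YMM register fits 8 floats or
// 4 doubles, and a 512-bit ZMM register fits 16 floats or 8 doubles.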
template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
}
template <typename T>
inline void vec_scal(const int n, const T a, T* x) {
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
}
#ifdef PADDLE_WITH_MKLML
template <>
inline void vec_exp<float>(const int n, const float* x, float* y) {
constexpr int small_enough = 128;
if (n < small_enough) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
} else {
platform::dynload::vsExp(n, x, y);
}
}
template <>
inline void vec_exp<double>(const int n, const double* x, double* y) {
platform::dynload::vdExp(n, x, y);
}
template <>
inline void vec_scal<float>(const int n, const float a, float* x) {
platform::dynload::cblas_sscal(n, a, x, 1);
}
template <>
inline void vec_scal<double>(const int n, const double a, double* x) {
platform::dynload::cblas_dscal(n, a, x, 1);
}
#endif
// MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a * x[i];
}
}
template <>
inline void vec_scal<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_scal<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 scalar = _mm256_set1_ps(a);
__m256 tmp;
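// One vector step: load 8 floats, multiply by the broadcast scalar, store.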
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// The tail cannot reuse an overlapping vector step (src and dst may alias); finish it with scalar code.
for (i = n - rest; i < n; ++i) {
y[i] = a * x[i];
}
#else
vec_scal<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_scal<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_scal<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_scal<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_scal<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sum(const size_t n, const T* x, T* s) {
s[0] = x[0];
for (size_t i = 1; i < n; ++i) {
s[0] += x[i];
}
}
template <>
inline void vec_sum<float, platform::avx>(const size_t n,
const float* x,
float* s) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sum<float, platform::isa_any>(n, x, s);
return;
}
unsigned int i, end;
i = end = 0;
s[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
}
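// Horizontal reduction: pairwise hadd within each 128-bit lane, fold the
// upper lane onto the lower one, hadd again, then store lane 0 into s.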
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
s,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
s[0] += x[i];
}
#else
vec_sum<float, platform::isa_any>(n, x, s);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
for (size_t i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
template <>
inline void vec_mul<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
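// Round n down to a whole number of blocks; block is a power of two.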
end = n & ~(block - 1);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(
z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
for (; i < n; i++) {
z[i] = x[i] * y[i];
}
#else
vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
z[0] = x[0] * y[0];
for (size_t i = 1; i < n; ++i) {
z[0] += x[i] * y[i];
}
}
template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
z[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(
tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
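// Same horizontal reduction as in vec_sum: fold the lanes, then store lane 0 into z.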
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
z,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
z[0] += x[i] * y[i];
}
#else
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// The tail cannot reuse an overlapping vector step (src and dst may alias); finish it with scalar code.
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_bias_sub<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::avx>(
const int n, const float* x, const float* y, const float* z, float* out) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// The tail cannot reuse an overlapping vector step (src and dst may alias); finish it with scalar code.
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::avx2>(
const int n, const float* x, const float* y, const float* z, float* out) {
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::avx512f>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
for (size_t i = 0; i < n; ++i) {
y[i] = x[i] < a ? a : x[i];
}
}
template <>
inline void vec_clip<float, platform::avx>(const size_t n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_clip<float, platform::isa_any>(n, a, x, y);
return;
}
unsigned int i = 0, end = 0;
end = n & ~(block - 1);
__m256 threshold = _mm256_set1_ps(a);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
}
for (; i < n; i++) {
y[i] = x[i] < a ? a : x[i];
}
#else
vec_clip<float, platform::isa_any>(n, a, x, y);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] + a;
}
}
template <>
inline void vec_add_bias<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_add_bias<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_add_ps(tmp, bias); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// The tail cannot reuse an overlapping vector step (src and dst may alias); finish it with scalar code.
for (i = n - rest; i < n; ++i) {
y[i] = x[i] + a;
}
#else
vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_add_bias<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_add_bias<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_add_bias<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_add_bias<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
vec_exp<T>(n, y, y);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
template <>
inline void vec_sigmoid<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sigmoid<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
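// Phase 1: y = -clip(x, min, max); the vec_exp below then yields exp(-clip(x)).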
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(zeros, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest != 0) {
// Scalar tail: src and dst may share the same address, so an overlapping vector step is unsafe.
const float xmin = SIGMOID_THRESHOLD_MIN;
const float xmax = SIGMOID_THRESHOLD_MAX;
for (i = n - rest; i < n; ++i) {
y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
}
}
vec_exp<float>(n, y, y);
__m256 ones = _mm256_set1_ps(1.0f);
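// Phase 2: y = 1 / (1 + exp(-clip(x))).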
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(y + i); \
tmp = _mm256_add_ps(ones, tmp); \
tmp = _mm256_div_ps(ones, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// Scalar tail: an overlapping vector step would reread already-updated values.
for (i = n - rest; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]);
}
#else
vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_sigmoid<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_sigmoid<float, platform::avx>(n, x, y);
}
template <>
inline void vec_sigmoid<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_sigmoid<float, platform::avx2>(n, x, y);
}
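// tanh(x) = 2 * sigmoid(2x) - 1, composed from the vectorized primitives above.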
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
vec_scal<T, isa>(n, static_cast<T>(2), x, y);
vec_sigmoid<T, isa>(n, y, y);
vec_scal<T>(n, static_cast<T>(2), y);
vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
}
// TODO(TJ): make relu clip
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) {
vec_relu<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
if (rest == 0) {
return;
}
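// Cover the tail by redoing the last full block; the overlap with the
// previous block is safe because max(x, 0) is idempotent.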
i = n - block;
MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
vec_relu<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_relu<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_relu<float, platform::avx>(n, x, y);
}
template <>
inline void vec_relu<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_relu<float, platform::avx2>(n, x, y);
}
// TODO(TJ): optimize the double versions of sigmoid, tanh and relu if necessary
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type should be one of sigmoid, relu, tanh, identity. But got "
"unsupported type: %s.",
type));
}
};
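// Example: select an activation by name and apply it to n contiguous floats.
//   VecActivations<float, platform::avx> act;
//   auto tanh = act("tanh");
//   tanh(n, x, y);  // y[i] = tanh(x[i]) for i in [0, n)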
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -21,9 +21,9 @@ limitations under the License. */ ...@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h" #include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> { ...@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
phi::make_ddim({1UL, end_pos - start_pos}); phi::make_ddim({1UL, end_pos - start_pos});
x_i.Resize(dims_i); x_i.Resize(dims_i);
out_i.Resize(dims_i); out_i.Resize(dims_i);
math::SoftmaxCUDNNFunctor<T, phi::GPUContext>()( phi::funcs::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(), &x_i, &out_i); ctx.template device_context<phi::GPUContext>(), &x_i, &out_i);
} }
} }
...@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> { ...@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
out_i.Resize(dims_i); out_i.Resize(dims_i);
out_grad_i.Resize(dims_i); out_grad_i.Resize(dims_i);
x_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i);
math::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()( phi::funcs::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(), ctx.template device_context<phi::GPUContext>(),
&out_i, &out_i,
&out_grad_i, &out_grad_i,
......
...@@ -16,10 +16,10 @@ limitations under the License. */ ...@@ -16,10 +16,10 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h" #include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -56,18 +56,10 @@ cc_test( ...@@ -56,18 +56,10 @@ cc_test(
SRCS enforce_test.cc SRCS enforce_test.cc
DEPS enforce) DEPS enforce)
set(CPU_INFO_DEPS gflags glog enforce)
if(WITH_XBYAK)
list(APPEND CPU_INFO_DEPS xbyak)
endif()
cc_library(
cpu_info
SRCS cpu_info.cc
DEPS ${CPU_INFO_DEPS})
cc_test( cc_test(
cpu_info_test cpu_info_test
SRCS cpu_info_test.cc SRCS cpu_info_test.cc
DEPS cpu_info) DEPS phi_backends)
cc_library( cc_library(
os_info os_info
SRCS os_info.cc SRCS os_info.cc
...@@ -194,7 +186,6 @@ cc_library( ...@@ -194,7 +186,6 @@ cc_library(
phi_place phi_place
eigen3 eigen3
cpu_helper cpu_helper
cpu_info
framework_proto framework_proto
${IPU_CTX_DEPS} ${IPU_CTX_DEPS}
${GPU_CTX_DEPS} ${GPU_CTX_DEPS}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
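// Without xbyak, query CPU features via raw cpuid; this path is unavailable
// on NV Jetson, ARM, SW, and MIPS builds.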
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace platform {
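//! Get the total physical memory of the machine, in bytes.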
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size of CUDA pinned memory.
size_t CUDAPinnedMaxAllocSize();
using phi::backends::cpu::CpuMinChunkSize;
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size of NPU pinned memory.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
using namespace phi::backends::cpu; // NOLINT
// Check whether the CPU supports the given instruction set.
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace platform
} // namespace paddle
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/backends/cpu/cpu_info.h"
#include <sstream> #include <sstream>
...@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use); ...@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use);
TEST(CpuMemoryUsage, Print) { TEST(CpuMemoryUsage, Print) {
std::stringstream ss; std::stringstream ss;
size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; size_t memory_size =
phi::backends::cpu::CpuMaxAllocSize() / 1024 / 1024 / 1024;
float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
......
...@@ -16,9 +16,9 @@ limitations under the License. */ ...@@ -16,9 +16,9 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
......
...@@ -29,7 +29,7 @@ cc_library( ...@@ -29,7 +29,7 @@ cc_library(
cc_library( cc_library(
cpu_utilization cpu_utilization
SRCS cpu_utilization.cc SRCS cpu_utilization.cc
DEPS cpu_info os_info enforce glog) DEPS phi_backends os_info enforce glog)
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
......
...@@ -72,7 +72,6 @@ limitations under the License. */ ...@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -89,6 +88,7 @@ limitations under the License. */ ...@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
......
...@@ -72,7 +72,6 @@ limitations under the License. */ ...@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -89,6 +88,7 @@ limitations under the License. */ ...@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
......
...@@ -75,7 +75,6 @@ limitations under the License. */ ...@@ -75,7 +75,6 @@ limitations under the License. */
#include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -94,6 +93,7 @@ limitations under the License. */ ...@@ -94,6 +93,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/pybind/jit.h" #include "paddle/fluid/pybind/jit.h"
#include "paddle/fluid/pybind/xpu_streams_py.h" #include "paddle/fluid/pybind/xpu_streams_py.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
...@@ -327,7 +327,7 @@ bool SupportsBfloat16() { ...@@ -327,7 +327,7 @@ bool SupportsBfloat16() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
#else #else
if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
return true; return true;
else else
return false; return false;
...@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() { ...@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
#else #else
if (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16))
return true; return true;
else else
return false; return false;
...@@ -349,8 +349,8 @@ bool SupportsInt8() { ...@@ -349,8 +349,8 @@ bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
#else #else
return (platform::MayIUse(platform::cpu_isa_t::avx2) || return (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2) ||
platform::MayIUse(platform::cpu_isa_t::avx512f)); phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512f));
#endif #endif
} }
...@@ -358,7 +358,8 @@ bool SupportsVNNI() { ...@@ -358,7 +358,8 @@ bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
return false; return false;
#else #else
return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni); return phi::backends::cpu::MayIUse(
phi::backends::cpu::cpu_isa_t::avx512_core_vnni);
#endif #endif
} }
...@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) { ...@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) {
BindJit(&m); BindJit(&m);
// Not used, just make sure cpu_info.cc is linked. // Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory(); phi::backends::cpu::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag(); paddle::memory::allocation::UseAllocatorStrategyGFlag();
......
...@@ -72,7 +72,6 @@ limitations under the License. */ ...@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -89,6 +88,7 @@ limitations under the License. */ ...@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
......
add_subdirectory(dynload) add_subdirectory(dynload)
add_subdirectory(gpu) add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context) set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#ifdef __APPLE__ #ifdef __APPLE__
#include <sys/sysctl.h> #include <sys/sysctl.h>
...@@ -30,6 +26,10 @@ limitations under the License. */ ...@@ -30,6 +26,10 @@ limitations under the License. */
#include <unistd.h> #include <unistd.h>
#endif // _WIN32 #endif // _WIN32
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include <algorithm> #include <algorithm>
#include "paddle/phi/core/flags.h" #include "paddle/phi/core/flags.h"
...@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, ...@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory,
true, true,
"If set, allocate cpu pinned memory."); "If set, allocate cpu pinned memory.");
namespace paddle { namespace phi {
namespace platform { namespace backends {
namespace cpu {
size_t CpuTotalPhysicalMemory() { size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__ #ifdef __APPLE__
...@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() { ...@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() {
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
} }
size_t CpuMinChunkSize() {
// The minimum chunk size allowed to allocate is 4 KB.
return 1 << 12;
}
size_t CUDAPinnedMaxAllocSize() { size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting // For distributed systems, it requires configuring and limiting
// the fraction of memory to use. // the fraction of memory to use.
...@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) { ...@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
} }
#endif #endif
} // namespace platform } // namespace cpu
} // namespace paddle } // namespace backends
} // namespace phi
...@@ -36,15 +36,52 @@ ...@@ -36,15 +36,52 @@
#define ALIGN32_END __attribute__((aligned(32))) #define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32 #endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
namespace phi { namespace phi {
namespace backends { namespace backends {
namespace cpu { namespace cpu {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size of CUDA pinned memory.
size_t CUDAPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator. //! Get the minimum chunk size for buddy allocator.
inline size_t CpuMinChunkSize() { size_t CpuMinChunkSize();
// Allow to allocate the minimum chunk size is 4 KB.
return 1 << 12; //! Get the maximum chunk size for buddy allocator.
} size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size of NPU pinned memory.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum { typedef enum {
isa_any, isa_any,
...@@ -59,6 +96,8 @@ typedef enum { ...@@ -59,6 +96,8 @@ typedef enum {
avx512_bf16, avx512_bf16,
} cpu_isa_t; // Instruction set architecture } cpu_isa_t; // Instruction set architecture
// Check whether the CPU supports the given instruction set.
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace cpu } // namespace cpu
} // namespace backends } // namespace backends
} // namespace phi } // namespace phi
...@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) ...@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
math_library(cross_entropy) math_library(cross_entropy)
math_library(im2col) math_library(im2col)
math_library(vol2col) math_library(vol2col)
math_library(softmax DEPS math_function)
cc_library( cc_library(
phi_data_layout_transform phi_data_layout_transform
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
namespace phi { namespace phi {
......
...@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h" #include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
namespace math {
template class SoftmaxFunctor<phi::CPUContext, float>; template class SoftmaxFunctor<phi::CPUContext, float>;
template class SoftmaxFunctor<phi::CPUContext, double>; template class SoftmaxFunctor<phi::CPUContext, double>;
template class SoftmaxGradFunctor<phi::CPUContext, float>; template class SoftmaxGradFunctor<phi::CPUContext, float>;
template class SoftmaxGradFunctor<phi::CPUContext, double>; template class SoftmaxGradFunctor<phi::CPUContext, double>;
} // namespace math } // namespace funcs
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and ...@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
namespace math {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout; using DataLayout = phi::backends::gpu::DataLayout;
template <typename T> template <typename T>
using CudnnDataType = platform::CudnnDataType<T>; using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
template <typename T, typename DeviceContext> template <typename T, typename DeviceContext>
void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()( void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
...@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()( ...@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_y_desc = miopenTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( PADDLE_ENFORCE_GPU_SUCCESS(
context.cudnn_handle(), phi::dynload::miopenSoftmaxForward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kOne(),
cudnn_x_desc, cudnn_x_desc,
X->data<T>(), X->data<T>(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kZero(),
cudnn_y_desc, cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace()), context.template Alloc<T>(Y),
MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE)); MIOPEN_SOFTMAX_MODE_INSTANCE));
#else #else
cudnnTensorDescriptor_t cudnn_x_desc = cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_y_desc = cudnnTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( PADDLE_ENFORCE_GPU_SUCCESS(
context.cudnn_handle(), phi::dynload::cudnnSoftmaxForward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kOne(),
cudnn_x_desc, cudnn_x_desc,
X->data<T>(), X->data<T>(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kZero(),
cudnn_y_desc, cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace()))); context.template Alloc<T>(Y)));
#endif #endif
} }
...@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()( ...@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims); dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_ygrad_desc = miopenTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims); dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( PADDLE_ENFORCE_GPU_SUCCESS(
context.cudnn_handle(), phi::dynload::miopenSoftmaxBackward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(), CudnnDataType<T>::kOne(),
cudnn_y_desc, cudnn_y_desc,
Y->data<T>(), Y->data<T>(),
cudnn_ygrad_desc, cudnn_ygrad_desc,
YGrad->data<T>(), YGrad->data<T>(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kZero(),
cudnn_xgrad_desc, cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace()), context.template Alloc<T>(XGrad),
MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE)); MIOPEN_SOFTMAX_MODE_INSTANCE));
#else #else
cudnnTensorDescriptor_t cudnn_y_desc = cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims); yDesc.descriptor<T>(layout, cudnn_tensor_dims);
...@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()( ...@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims); dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_ygrad_desc = cudnnTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims); dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( PADDLE_ENFORCE_GPU_SUCCESS(
context.cudnn_handle(), phi::dynload::cudnnSoftmaxBackward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(), CudnnDataType<T>::kOne(),
cudnn_y_desc, cudnn_y_desc,
Y->data<T>(), Y->data<T>(),
cudnn_ygrad_desc, cudnn_ygrad_desc,
YGrad->data<T>(), YGrad->data<T>(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kZero(),
cudnn_xgrad_desc, cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace()))); context.template Alloc<T>(XGrad)));
#endif #endif
} }
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>; template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>; template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>; template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>; template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0) #if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>; template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>; template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
#endif #endif
// MIOPEN do not support double // MIOPEN do not support double
...@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>; ...@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>; template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif #endif
template class SoftmaxFunctor<phi::GPUContext, platform::float16>; template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16>; template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, float>; template class SoftmaxFunctor<phi::GPUContext, float>;
template class SoftmaxFunctor<phi::GPUContext, double>; template class SoftmaxFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, float>; template class SoftmaxGradFunctor<phi::GPUContext, float>;
template class SoftmaxGradFunctor<phi::GPUContext, double>; template class SoftmaxGradFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::float16>; template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::bfloat16>; template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
} // namespace math } // namespace funcs
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
namespace math {
template <typename DeviceContext, typename T, typename Enable = void> template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxFunctor { class SoftmaxFunctor {
...@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor { ...@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor {
#endif #endif
} // namespace math } // namespace funcs
} // namespace operators } // namespace phi
} // namespace paddle
...@@ -15,24 +15,22 @@ limitations under the License. */ ...@@ -15,24 +15,22 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace paddle { namespace phi {
namespace operators { namespace funcs {
namespace math {
template <typename T, template <typename T,
int MajorType = Eigen::RowMajor, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T>
struct ValueClip { struct ValueClip {
...@@ -104,7 +102,7 @@ class SoftmaxEigen { ...@@ -104,7 +102,7 @@ class SoftmaxEigen {
}; };
template <typename DeviceContext> template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::float16> { class SoftmaxEigen<DeviceContext, phi::dtype::float16> {
public: public:
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const int axis_dim, const int axis_dim,
...@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> { ...@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
constexpr int kClassDim = 1; constexpr int kClassDim = 1;
constexpr int kAxisDim = 1; constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::float16>::From(*X); auto logits = EigenMatrix<phi::dtype::float16>::From(*X);
auto softmax = EigenMatrix<platform::float16>::From(*Y); auto softmax = EigenMatrix<phi::dtype::float16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim); const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim); const int num_classes = logits.dimension(kClassDim);
...@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> { ...@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
(logits - logits.maximum(along_axis) (logits - logits.maximum(along_axis)
.reshape(batch_by_one) .reshape(batch_by_one)
.broadcast(one_by_class)) .broadcast(one_by_class))
.unaryExpr(ValueClip<platform::float16>()); .unaryExpr(ValueClip<phi::dtype::float16>());
} else { } else {
// axis != -1, class dimension split into (axis, remain), max and sum // axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension // should be calculated along axis dimension
...@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> { ...@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
.reshape(batch_one_remain) .reshape(batch_one_remain)
.broadcast(one_axis_one) .broadcast(one_axis_one)
.reshape(batch_classes)) .reshape(batch_classes))
.unaryExpr(ValueClip<platform::float16>()); .unaryExpr(ValueClip<phi::dtype::float16>());
} }
softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = softmax.exp();
...@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> { ...@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
}; };
template <typename DeviceContext> template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::bfloat16> { class SoftmaxEigen<DeviceContext, phi::dtype::bfloat16> {
public: public:
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const int axis_dim, const int axis_dim,
...@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> { ...@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
constexpr int kClassDim = 1; constexpr int kClassDim = 1;
constexpr int kAxisDim = 1; constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::bfloat16>::From(*X); auto logits = EigenMatrix<phi::dtype::bfloat16>::From(*X);
auto softmax = EigenMatrix<platform::bfloat16>::From(*Y); auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim); const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim); const int num_classes = logits.dimension(kClassDim);
...@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> { ...@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
(logits - logits.maximum(along_axis) (logits - logits.maximum(along_axis)
.reshape(batch_by_one) .reshape(batch_by_one)
.broadcast(one_by_class)) .broadcast(one_by_class))
.unaryExpr(ValueClip<platform::bfloat16>()); .unaryExpr(ValueClip<phi::dtype::bfloat16>());
} else { } else {
// axis != -1, class dimension split into (axis, remain), max and sum // axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension // should be calculated along axis dimension
...@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> { ...@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
.reshape(batch_one_remain) .reshape(batch_one_remain)
.broadcast(one_axis_one) .broadcast(one_axis_one)
.reshape(batch_classes)) .reshape(batch_classes))
.unaryExpr(ValueClip<platform::bfloat16>()); .unaryExpr(ValueClip<phi::dtype::bfloat16>());
} }
softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = softmax.exp();
...@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> { ...@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const int batch_size = in_dims[kBatchDim]; const int batch_size = in_dims[kBatchDim];
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) { if (num_remain == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
const T* in_data = X->data<T>(); const T* in_data = X->data<T>();
T* out_data = Y->data<T>(); T* out_data = Y->data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes); T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1); max_val *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, max_val, in_data, out_data); vec_add_bias<T, phi::backends::cpu::avx>(
vec_clip<T, platform::avx>( num_classes, max_val, in_data, out_data);
vec_clip<T, phi::backends::cpu::avx>(
num_classes, static_cast<T>(-64), out_data, out_data); num_classes, static_cast<T>(-64), out_data, out_data);
vec_exp<T>(num_classes, out_data, out_data); vec_exp<T>(num_classes, out_data, out_data);
T sum = 0; T sum = 0;
vec_sum<T, platform::avx>(num_classes, out_data, &sum); vec_sum<T, phi::backends::cpu::avx>(num_classes, out_data, &sum);
sum = static_cast<T>(1) / sum; sum = static_cast<T>(1) / sum;
vec_scal<T, platform::avx>(num_classes, sum, out_data, out_data); vec_scal<T, phi::backends::cpu::avx>(
num_classes, sum, out_data, out_data);
in_data += num_classes; in_data += num_classes;
out_data += num_classes; out_data += num_classes;
@@ -308,16 +309,16 @@ class SoftmaxGradEigen {
 };
 template <typename DeviceContext>
-class SoftmaxGradEigen<DeviceContext, platform::float16> {
+class SoftmaxGradEigen<DeviceContext, phi::dtype::float16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
                   const phi::DenseTensor* y,
                   const phi::DenseTensor* y_grad,
                   phi::DenseTensor* x_grad) {
-    auto softmax = EigenMatrix<platform::float16>::From(*y);
-    auto softmax_grad = EigenMatrix<platform::float16>::From(*y_grad);
-    auto logits_grad = EigenMatrix<platform::float16>::From(*x_grad);
+    auto softmax = EigenMatrix<phi::dtype::float16>::From(*y);
+    auto softmax_grad = EigenMatrix<phi::dtype::float16>::From(*y_grad);
+    auto logits_grad = EigenMatrix<phi::dtype::float16>::From(*x_grad);
     constexpr int kBatchDim = 0;
     constexpr int kClassDim = 1;
@@ -342,16 +343,16 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
 };
 template <typename DeviceContext>
-class SoftmaxGradEigen<DeviceContext, platform::bfloat16> {
+class SoftmaxGradEigen<DeviceContext, phi::dtype::bfloat16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
                   const phi::DenseTensor* y,
                   const phi::DenseTensor* y_grad,
                   phi::DenseTensor* x_grad) {
-    auto softmax = EigenMatrix<platform::bfloat16>::From(*y);
-    auto softmax_grad = EigenMatrix<platform::bfloat16>::From(*y_grad);
-    auto logits_grad = EigenMatrix<platform::bfloat16>::From(*x_grad);
+    auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*y);
+    auto softmax_grad = EigenMatrix<phi::dtype::bfloat16>::From(*y_grad);
+    auto logits_grad = EigenMatrix<phi::dtype::bfloat16>::From(*x_grad);
     constexpr int kBatchDim = 0;
     constexpr int kClassDim = 1;
@@ -400,17 +401,20 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
     const int batch_size = out_dims[kBatchDim];
     const int num_remain = num_classes / axis_dim;
-    if (num_remain == 1 && platform::MayIUse(platform::avx)) {
+    if (num_remain == 1 &&
+        phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
       const T* out_data = y->data<T>();
       const T* out_grad = y_grad->data<T>();
       T* in_grad = x_grad->data<T>();
       for (int bs = 0; bs < batch_size; ++bs) {
         T scalar;
-        vec_mul_reduce<T, platform::avx>(
+        vec_mul_reduce<T, phi::backends::cpu::avx>(
             num_classes, out_grad, out_data, &scalar);
         scalar *= static_cast<T>(-1);
-        vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
-        vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
+        vec_add_bias<T, phi::backends::cpu::avx>(
+            num_classes, scalar, out_grad, in_grad);
+        vec_mul<T, phi::backends::cpu::avx>(
+            num_classes, out_data, in_grad, in_grad);
         out_data += num_classes;
         out_grad += num_classes;
         in_grad += num_classes;
@@ -422,6 +426,5 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
   }
 };
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
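Note: the AVX backward path implements the usual softmax gradient identity dx = y * (dy - <dy, y>): vec_mul_reduce computes the inner product, vec_add_bias subtracts it from dy, and vec_mul applies the elementwise product with y. A scalar sketch of the same step (illustrative name only):

// Scalar reference for the vectorized backward path:
//   dx[i] = y[i] * (dy[i] - sum_j dy[j] * y[j])
void SoftmaxGradRowReference(const float* y, const float* dy, float* dx, int n) {
  float dot = 0.f;
  for (int i = 0; i < n; ++i) dot += dy[i] * y[i];  // vec_mul_reduce
  for (int i = 0; i < n; ++i) {
    dx[i] = y[i] * (dy[i] - dot);                   // vec_add_bias(-dot), then vec_mul
  }
}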
...
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
@@ -32,6 +31,7 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 namespace phi {
...
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
@@ -33,6 +32,7 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 namespace phi {
@@ -1386,7 +1386,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
     labels_2d.Resize({n, label.numel() / n});
     DenseTensor loss_2d(*loss);
     loss_2d.Resize({n, 1});
-    paddle::operators::math::SoftmaxCUDNNFunctor<T, GPUContext>()(
+    phi::funcs::SoftmaxCUDNNFunctor<T, GPUContext>()(
         dev_ctx, &logits_2d, &softmax_2d);
     phi::funcs::CrossEntropyFunctor<GPUContext, T>()(dev_ctx,
                                                      &loss_2d,
...
@@ -14,11 +14,11 @@
 #pragma once
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 namespace phi {
@@ -50,7 +50,7 @@ void GumbelSoftmaxGradKernel(const Context& ctx,
   dx_2d.Resize({size_to_axis, size_from_axis});
   out_2d.Resize({size_to_axis, size_from_axis});
   dout_2d.Resize({size_to_axis, size_from_axis});
-  paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
+  phi::funcs::SoftmaxGradFunctor<Context, T>()(
       ctx, axis_dim, &out_2d, &dout_2d, &dx_2d);
 }
...
@@ -16,12 +16,12 @@
 #include <random>
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 namespace phi {
@@ -87,8 +87,7 @@ void GumbelSoftmaxKernelHelper(const Context& ctx,
                                size_to_axis,
                                size_from_axis,
                                temperature);
-  paddle::operators::math::SoftmaxFunctor<Context, T>()(
-      ctx, axis_dim, &x_noise_2d, &out_2d);
+  phi::funcs::SoftmaxFunctor<Context, T>()(ctx, axis_dim, &x_noise_2d, &out_2d);
   if (hard) {
     OneHotGenerator<Context, T>::Transform(ctx, x, out, axis);
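Note: only the namespace changes here; the algorithm is untouched. The helper perturbs the logits with temperature-scaled Gumbel noise (in the noise-generation step, which this hunk does not show) and then reuses the shared SoftmaxFunctor. A self-contained sketch of the conventional Gumbel-softmax forward, as an assumption about what the full helper computes:

#include <algorithm>
#include <cmath>
#include <limits>
#include <random>
#include <vector>

// Conventional Gumbel-softmax: y = softmax((x + g) / temperature),
// where g[i] = -log(-log(u[i])) is Gumbel(0, 1) noise, u[i] ~ U(0, 1).
std::vector<float> GumbelSoftmaxReference(const std::vector<float>& x,
                                          float temperature,
                                          std::mt19937* rng) {
  std::uniform_real_distribution<float> uniform(1e-8f, 1.0f);
  std::vector<float> y(x.size());
  float max_z = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < x.size(); ++i) {
    const float g = -std::log(-std::log(uniform(*rng)));  // Gumbel(0, 1) sample
    y[i] = (x[i] + g) / temperature;
    max_z = std::max(max_z, y[i]);
  }
  float sum = 0.f;
  for (float& v : y) {
    v = std::exp(v - max_z);  // stable softmax, as in the CPU path above
    sum += v;
  }
  for (float& v : y) v /= sum;
  return y;
}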
...
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/softmax_grad_kernel.h"
 namespace phi {
@@ -50,7 +50,7 @@ void SoftmaxGradKernel(const Context& dev_ctx,
   Out_2d.ShareDataWith(out).Resize({n, d});
   dOut_2d.ShareDataWith(out_grad).Resize({n, d});
-  paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
+  phi::funcs::SoftmaxGradFunctor<Context, T>()(
       dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d);
 }
...
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/softmax_kernel.h"
 namespace phi {
@@ -47,8 +47,7 @@ void SoftmaxKernel(const Context& dev_ctx,
   DenseTensor X_2d, Out_2d;
   X_2d.ShareDataWith(x).Resize({n, d});
   Out_2d.ShareDataWith(*out).Resize({n, d});
-  paddle::operators::math::SoftmaxFunctor<Context, T>()(
-      dev_ctx, axis_dim, &X_2d, &Out_2d);
+  phi::funcs::SoftmaxFunctor<Context, T>()(dev_ctx, axis_dim, &X_2d, &Out_2d);
 }
 }  // namespace phi
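Note: both 2-D wrappers fold the N-D tensor around `axis` using the helpers from axis_utils.h: n is the product of the dimensions before `axis` and d the product from `axis` onward, so the functors run row-wise over d. A sketch of that folding under the conventional SizeToAxis/SizeFromAxis semantics (the reference names are illustrative):

#include <cstdint>
#include <vector>

// Fold dims[0..axis) into n and dims[axis..rank) into d so that an N-D
// tensor can be viewed as an {n, d} matrix for the row-wise softmax.
int64_t SizeToAxisRef(int axis, const std::vector<int64_t>& dims) {
  int64_t n = 1;
  for (int i = 0; i < axis; ++i) n *= dims[i];
  return n;
}

int64_t SizeFromAxisRef(int axis, const std::vector<int64_t>& dims) {
  int64_t d = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) d *= dims[i];
  return d;
}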
@@ -22,7 +22,7 @@ endif()
 cc_test(
   test_cpu_vec
   SRCS test_cpu_vec.cc
-  DEPS blas cpu_info)
+  DEPS blas phi_backends)
 # For String Kernels
 cc_test(
...