Unverified Commit 344b99e1 authored by H huangjiyi, committed by GitHub

[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)

Parent 4672ea8e
......@@ -19,9 +19,9 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/inference/utils/table_printer.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/utils/string/split.h"
#ifdef PADDLE_WITH_TENSORRT
......@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
void AnalysisConfig::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) {
use_mkldnn_bfloat16_ = true;
LOG(INFO) << "Hardware support for BFLOAT16"
<< (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
<< (phi::backends::cpu::MayIUse(
phi::backends::cpu::cpu_isa_t::avx512_bf16)
? " is enabled"
: " is disabled. Simulation will be used");
} else {
......
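With this change the ISA probe goes through phi::backends::cpu instead of paddle::platform. As a hedged illustration of the relocated call pattern (a hypothetical helper, not part of this patch), mirroring the bf16 logic above:

#include "paddle/phi/backends/cpu/cpu_info.h"

// Hypothetical helper: true only when native AVX512-BF16 is usable;
// EnableMkldnnBfloat16 above falls back to simulation otherwise.
inline bool NativeBf16Available() {
  namespace cpu = phi::backends::cpu;
  return cpu::MayIUse(cpu::cpu_isa_t::avx512_core) &&
         cpu::MayIUse(cpu::cpu_isa_t::avx512_bf16);
}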
......@@ -29,7 +29,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_string(dirname, "", "dirname to tests.");
......@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
config.EnableUseGpu(100, 0);
config.EnableMkldnnBfloat16();
#ifdef PADDLE_WITH_MKLDNN
if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
else
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
......
......@@ -27,7 +27,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_string(dirname, "", "dirname to tests.");
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN");
......@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInputs(&input_slots_all);
if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 &&
platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) {
phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16)) {
b_cfg.EnableMkldnnBfloat16();
} else {
FLAGS_enable_bf16 = false;
......
......@@ -27,9 +27,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/allocation/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace memory {
......
......@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
phi::backends::cpu::CpuMinChunkSize(),
phi::backends::cpu::CpuMaxChunkSize());
});
return a;
......@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUPinnedAllocator),
platform::NPUPinnedMinChunkSize(),
platform::NPUPinnedMaxChunkSize());
phi::backends::cpu::NPUPinnedMinChunkSize(),
phi::backends::cpu::NPUPinnedMaxChunkSize());
});
return ba;
......@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
phi::backends::cpu::CUDAPinnedMinChunkSize(),
phi::backends::cpu::CUDAPinnedMaxChunkSize());
});
return ba;
......
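Each getter above follows the same shape: a function-local pointer built exactly once under std::call_once from a backend-specific system allocator plus min/max chunk sizes. A self-contained sketch of that initialization pattern (generic names and placeholder sizes, not Paddle's):

#include <cstddef>
#include <mutex>

struct DummyAllocator {
  DummyAllocator(std::size_t min_chunk, std::size_t max_chunk)
      : min_chunk(min_chunk), max_chunk(max_chunk) {}
  std::size_t min_chunk;
  std::size_t max_chunk;
};

// Build the process-wide allocator exactly once, even if several threads race
// on the first call -- the pattern used by GetCPUBuddyAllocator and friends.
DummyAllocator* GetAllocatorSingleton() {
  static std::once_flag init_flag;
  static DummyAllocator* a = nullptr;
  std::call_once(init_flag, [] {
    a = new DummyAllocator(/*min_chunk=*/4096, /*max_chunk=*/1 << 26);  // placeholder sizes
  });
  return a;
}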
......@@ -28,10 +28,10 @@ limitations under the License. */
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable =
paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
phi::backends::cpu::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
......@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
size_t usable =
paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
......
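Both pinned allocators above apply the same guard: the still-usable budget is the cap minus what has already been handed out, and a request larger than that is refused with a warning. A scalar sketch of that check (hypothetical helper name):

#include <cstddef>

// Return whether a pinned allocation of `request` bytes still fits under the
// cap, given the bytes already allocated -- the usable-size test used above.
inline bool FitsPinnedBudget(std::size_t request,
                             std::size_t already_allocated,
                             std::size_t cap) {
  std::size_t usable = cap > already_allocated ? cap - already_allocated : 0;
  return request <= usable;
}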
......@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
// This unit test is an example comparing the performance between using pinned
// memory and not. In general, using pinned memory will be faster.
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) {
phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
phi::funcs::vec_relu<T, platform::avx>(n, y, y);
phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(n, *bias, x, y);
phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, y, y);
} else {
phi::funcs::vec_relu<T, platform::avx>(n, x, y);
phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, x, y);
}
}
......@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar;
}
phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y); // sub
phi::funcs::vec_exp<T>(n, y, y); // exp
phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(
n, -scalar, x, y); // sub
phi::funcs::vec_exp<T>(n, y, y); // exp
// sum
scalar = T(0);
for (int i = 0; i < n; ++i) {
......@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::MayIUse(platform::avx)) {
phi::funcs::VecActivations<T, platform::avx> act_functor;
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
phi::funcs::VecActivations<T, platform::isa_any> act_functor;
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
......
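vec_softmax above strings the phi::funcs primitives into the usual max-shifted softmax: find the row max, subtract it, exponentiate, sum, then scale by the reciprocal sum. A self-contained scalar reference of those steps (no AVX, illustrative only):

#include <algorithm>
#include <cmath>

// Scalar reference for the steps in vec_softmax: shift by the max for
// numerical stability, exponentiate, then normalize by the sum.
void softmax_ref(int n, const float* x, float* y) {
  float mx = *std::max_element(x, x + n);
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i] - mx);
    sum += y[i];
  }
  for (int i = 0; i < n; ++i) y[i] /= sum;
}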
......@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
......@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
......@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());
// step 3, obtain predict target
phi::DenseTensor predicted_logits;
......@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
......@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
comm->comm(),
stream));
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
......@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
......@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());
// step 3, obtain predict target
phi::DenseTensor predicted_logits;
......@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
......@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
opts.reduce_op = distributed::ReduceOp::SUM;
pg->AllReduce(in_out, in_out, opts)->Synchronize();
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
......
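The functor above evaluates softmax cross entropy in the shifted domain: per row, loss = log(sum_j exp(logit_j - max)) - (logit_label - max). A scalar sketch of that identity (self-contained; the all-reduce steps across ranks are omitted):

#include <algorithm>
#include <cmath>
#include <vector>

// Per-row softmax cross entropy written as in the Eigen expressions above:
// subtract the row max, accumulate the exp-sum, and take
// log(sum_exp) minus the shifted logit of the label class.
float softmax_xent_row(const std::vector<float>& logits, int label) {
  float mx = logits[0];
  for (float v : logits) mx = std::max(mx, v);
  float sum_exp = 0.f;
  for (float v : logits) sum_exp += std::exp(v - mx);
  return std::log(sum_exp) - (logits[label] - mx);
}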
......@@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
namespace paddle {
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
......@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::MayIUse(platform::avx)) { \
phi::funcs::VecActivations<T, platform::avx> act_functor; \
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) { \
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
phi::funcs::VecActivations<T, platform::isa_any> act_functor; \
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::MayIUse(platform::avx)) {
phi::funcs::VecActivations<T, platform::avx> act_functor;
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
phi::funcs::VecActivations<T, platform::isa_any> act_functor;
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
......
......@@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file}
"\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
file(
GLOB jit_kernel_cc_srcs
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -98,27 +98,27 @@ DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me
bool VReluCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VSquareCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VIdentityCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VExpCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d < 32;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 32;
}
bool VSigmoidCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VTanhCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
size_t VReluCreator::CodeSize(const int& d) const {
......
......@@ -84,8 +84,8 @@ class VActFunc : public JitCode {
// compute EXP with ymm, xmm
template <typename JMM>
void exp_jmm(JMM& dst,
JMM& src,
void exp_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11,
int fx_idx = 12, // NOLINT
int fy_idx = 13,
......@@ -144,10 +144,11 @@ class VActFunc : public JitCode {
vcvttps2dq(ymm_int, jmm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx2) ||
std::is_same<JMM, xmm_t>::value) {
vpaddd(ymm_int, ymm_int, jmm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
} else if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
......@@ -174,8 +175,8 @@ class VActFunc : public JitCode {
// compute SIGMOID with ymm, xmm
template <typename JMM>
void sigmoid_jmm(JMM& dst,
JMM& src,
void sigmoid_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11, // NOLINT
int fx_idx = 12,
int fy_idx = 13,
......@@ -203,8 +204,8 @@ class VActFunc : public JitCode {
// compute TANH with ymm, xmm
template <typename JMM>
void tanh_jmm(JMM& dst,
JMM& src,
void tanh_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11, // NOLINT
int fx_idx = 12,
int fy_idx = 13,
......
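Both the avx2 and avx branches of exp_jmm above rebuild 2^n from the integer produced by vcvttps2dq: add the exponent bias 0x7f, then shift it into the IEEE-754 exponent field at bit 23. A self-contained scalar restatement of that step (illustrative only; valid for n roughly in [-126, 127]):

#include <cstdint>
#include <cstring>

// Place (n + 127) into the float exponent field, yielding 2^n -- the same
// effect as the vpaddd(exp_int_0x7f) + vpslld(..., 23) pair in exp_jmm.
inline float pow2_from_int(int n) {
  uint32_t bits = static_cast<uint32_t>(n + 127) << 23;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}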
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -132,7 +132,7 @@ void AdamJitCode::genCode() {
class AdamCreator : public JitCodeCreator<adam_attr_t> {
public:
bool CanBeUsed(const adam_attr_t& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const adam_attr_t& attr) const override {
return 96 + 32 * 8;
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -147,7 +147,7 @@ void AdamWJitCode::genCode() {
class AdamWCreator : public JitCodeCreator<int> {
public:
bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; }
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -145,7 +145,7 @@ void NCHW16CMulNCJitCode::genCode() {
class NCHW16CMulNCCreator : public JitCodeCreator<int> {
public:
bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const int& d) const override { return 256 * 1024; }
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
......@@ -157,7 +157,8 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx) && attr <= 1024; \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr <= 1024; \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -123,7 +123,7 @@ void EmbSeqPoolJitCode::genCode() {
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
public:
bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
attr.table_width % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -85,20 +85,21 @@ void GRUJitCode::genCode() {
ret();
}
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const gru_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_GRU_CREATOR(GRUH1);
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -78,7 +78,7 @@ void HOPVJitCode::genCode() {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx); \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
......
......@@ -18,7 +18,7 @@
#include <type_traits>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
......@@ -92,7 +92,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
for (int i = 0; i < num_g_abi_regs; ++i) {
push(Xbyak::Reg64(g_abi_regs[i]));
}
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
}
}
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -113,20 +113,21 @@ void LSTMJitCode::genCode() {
}
}
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_LSTM_CREATOR(LSTMCtHt);
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -110,12 +110,13 @@ void MatMulJitCode::genCode() {
class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
public:
bool CanBeUsed(const matmul_attr_t& attr) const override {
return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
return attr.m == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) &&
attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
}
size_t CodeSize(const matmul_attr_t& attr) const override {
int block = YMM_FLOAT_BLOCK;
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
}
return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -59,7 +59,7 @@ void SeqPoolJitCode::genCode() {
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool CanBeUsed(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
class SgdCreator : public JitCodeCreator<sgd_attr_t> {
public:
bool CanBeUsed(const sgd_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
attr.grad_width % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -69,7 +69,8 @@ void VBroadcastJitCode::genCode() {
class VBroadcastCreator : public JitCodeCreator<int64_t> {
public:
bool CanBeUsed(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
w % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const int64_t& w) const override {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
......
......@@ -17,8 +17,8 @@
#include <fstream>
#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifndef _WIN32
#define posix_memalign_free free
......@@ -66,7 +66,7 @@ void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
int block;
int max_num_regs;
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
max_num_regs = 32;
} else {
......
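packed_groups above picks the packing width from the best ISA the machine reports: 16 floats per ZMM register when AVX-512F is usable, otherwise 8 per YMM. A hedged sketch of that selection as a standalone helper (hypothetical name):

#include "paddle/phi/backends/cpu/cpu_info.h"

// Hypothetical helper mirroring packed_groups: a ZMM register holds 16 floats
// under AVX-512F, a YMM register holds 8 under AVX.
inline int PackedFloatBlock() {
  namespace cpu = phi::backends::cpu;
  return cpu::MayIUse(cpu::avx512f) ? 16 : 8;
}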
......@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -172,7 +172,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
#else
constexpr int block = YMM_FLOAT_BLOCK;
#endif
return platform::MayIUse(platform::avx) && d >= block;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d >= block;
}
} // namespace intrinsic
......
......@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -179,7 +179,8 @@ void LayerNorm(float* x,
}
bool LayerNormKernel::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
d >= YMM_FLOAT_BLOCK;
}
} // namespace intrinsic
......
......@@ -16,8 +16,8 @@
#include "paddle/fluid/operators/jit/refer/refer.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -188,17 +188,17 @@ void StrideASum<double>(const double* x, double* res, int n, int stride) {
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
}
template <>
bool VAddKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d > 512;
}
template <>
bool VScalKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
}
template <>
......@@ -274,7 +274,7 @@ bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
template <>
bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
template <>
......@@ -285,7 +285,7 @@ bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
template <>
bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
// tuned on avx2
return platform::MayIUse(platform::avx) && d < 60;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 60;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
......
......@@ -19,8 +19,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
......@@ -437,7 +437,7 @@ void TestKernelNCHW16CMulNC() {
EXPECT_TRUE(tgt != nullptr);
if (std::is_same<T, float>::value &&
paddle::platform::MayIUse(paddle::platform::avx512f)) {
phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
EXPECT_TRUE(jitcode != nullptr);
}
for (int ni = 0; ni < n; ni++) {
......@@ -1393,7 +1393,7 @@ TEST(JITKernel_helper, pack_weights) {
}
int block = 0;
std::vector<int> groups;
if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
groups.push_back(30);
} else {
......
......@@ -32,7 +32,6 @@ math_library(maxouting)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale)
math_library(softmax DEPS math_function jit_kernel_helper)
if(WITH_ASCEND_CL)
math_library(beam_search DEPS math_function beam_search_npu)
elseif(WITH_XPU)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
}
template <typename T>
inline void vec_scal(const int n, const T a, T* x) {
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
}
#ifdef PADDLE_WITH_MKLML
template <>
inline void vec_exp<float>(const int n, const float* x, float* y) {
constexpr int small_enough = 128;
if (n < small_enough) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
} else {
platform::dynload::vsExp(n, x, y);
}
}
template <>
inline void vec_exp<double>(const int n, const double* x, double* y) {
platform::dynload::vdExp(n, x, y);
}
template <>
inline void vec_scal<float>(const int n, const float a, float* x) {
platform::dynload::cblas_sscal(n, a, x, 1);
}
template <>
inline void vec_scal<double>(const int n, const double a, double* x) {
platform::dynload::cblas_dscal(n, a, x, 1);
}
#endif
// MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a * x[i];
}
}
template <>
inline void vec_scal<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_scal<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 scalar = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a * x[i];
}
#else
vec_scal<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_scal<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_scal<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_scal<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_scal<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sum(const size_t n, const T* x, T* s) {
s[0] = x[0];
for (size_t i = 1; i < n; ++i) {
s[0] += x[i];
}
}
template <>
inline void vec_sum<float, platform::avx>(const size_t n,
const float* x,
float* s) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sum<float, platform::isa_any>(n, x, s);
return;
}
unsigned int i, end;
i = end = 0;
s[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
}
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
s,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
s[0] += x[i];
}
#else
vec_sum<float, platform::isa_any>(n, x, s);
#endif
}
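// Hedged aside, not part of this header: the AVX branch above reduces a __m256
// to one float via hadd, a 128-bit lane swap, and a final hadd. The same
// reduction as a standalone helper (assumes __AVX__ and <immintrin.h>):
//   [a0..a7] -> hadd -> pairwise sums repeated within each 128-bit lane
//   -> add the swapped lanes -> every slot holds a sum of four inputs
//   -> the final _mm_hadd_ps leaves the full sum in lane 0.
#ifdef __AVX__
inline float avx_hsum_sketch(__m256 v) {
  __m256 h = _mm256_hadd_ps(v, v);
  h = _mm256_add_ps(h, _mm256_permute2f128_ps(h, h, 0x1));
  __m128 lo = _mm256_castps256_ps128(h);
  float out;
  _mm_store_ss(&out, _mm_hadd_ps(lo, lo));
  return out;
}
#endif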
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
for (size_t i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
template <>
inline void vec_mul<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
end = n & ~(block - 1);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(
z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
for (; i < n; i++) {
z[i] = x[i] * y[i];
}
#else
vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
z[0] = x[0] * y[0];
for (size_t i = 1; i < n; ++i) {
z[0] += x[i] * y[i];
}
}
template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
z[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(
tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
z,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
z[0] += x[i] * y[i];
}
#else
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_bias_sub<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::avx>(
const int n, const float* x, const float* y, const float* z, float* out) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::avx2>(
const int n, const float* x, const float* y, const float* z, float* out) {
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::avx512f>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
for (size_t i = 0; i < n; ++i) {
y[i] = x[i] < a ? a : x[i];
}
}
template <>
inline void vec_clip<float, platform::avx>(const size_t n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_clip<float, platform::isa_any>(n, a, x, y);
return;
}
unsigned int i = 0, end = 0;
end = n & ~(block - 1);
__m256 threshold = _mm256_set1_ps(a);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
}
for (; i < n; i++) {
y[i] = x[i] < a ? a : x[i];
}
#else
vec_clip<float, platform::isa_any>(n, a, x, y);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] + a;
}
}
template <>
inline void vec_add_bias<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_add_bias<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_add_ps(tmp, bias); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = x[i] + a;
}
#else
vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_add_bias<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_add_bias<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_add_bias<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_add_bias<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
vec_exp<T>(n, y, y);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
template <>
inline void vec_sigmoid<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sigmoid<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(zeros, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest != 0) {
// can not continue move step since the src and dst address could be equal
const float xmin = SIGMOID_THRESHOLD_MIN;
const float xmax = SIGMOID_THRESHOLD_MAX;
for (i = n - rest; i < n; ++i) {
y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
}
}
vec_exp<float>(n, y, y);
__m256 ones = _mm256_set1_ps(1.0f);
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(y + i); \
tmp = _mm256_add_ps(ones, tmp); \
tmp = _mm256_div_ps(ones, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step
for (i = n - rest; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]);
}
#else
vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_sigmoid<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_sigmoid<float, platform::avx>(n, x, y);
}
template <>
inline void vec_sigmoid<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_sigmoid<float, platform::avx2>(n, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
vec_scal<T, isa>(n, static_cast<T>(2), x, y);
vec_sigmoid<T, isa>(n, y, y);
vec_scal<T>(n, static_cast<T>(2), y);
vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
}
// TODO(TJ): make relu clip
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) {
vec_relu<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
if (rest == 0) {
return;
}
i = n - block;
MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
vec_relu<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_relu<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_relu<float, platform::avx>(n, x, y);
}
template <>
inline void vec_relu<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_relu<float, platform::avx2>(n, x, y);
}
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type should be one of sigmod, relu, tanh, identity. But got "
"not support type: %s.",
type));
}
};
} // namespace math
} // namespace operators
} // namespace paddle
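vec_tanh in the listing above composes tanh from the sigmoid primitives via the identity tanh(x) = 2 * sigmoid(2x) - 1 (scale by 2, sigmoid, scale by 2, subtract 1). A self-contained scalar check of that identity:

#include <cassert>
#include <cmath>

// tanh(x) = 2 * sigmoid(2x) - 1, the identity vec_tanh builds on above.
float tanh_via_sigmoid(float x) {
  float s = 1.f / (1.f + std::exp(-2.f * x));
  return 2.f * s - 1.f;
}

int main() {
  for (float x = -5.f; x <= 5.f; x += 0.25f) {
    assert(std::fabs(tanh_via_sigmoid(x) - std::tanh(x)) < 1e-5f);
  }
  return 0;
}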
......@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
phi::make_ddim({1UL, end_pos - start_pos});
x_i.Resize(dims_i);
out_i.Resize(dims_i);
math::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
phi::funcs::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(), &x_i, &out_i);
}
}
......@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
out_i.Resize(dims_i);
out_grad_i.Resize(dims_i);
x_grad_i.Resize(dims_i);
math::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
phi::funcs::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(),
&out_i,
&out_grad_i,
......
......@@ -16,10 +16,10 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -56,18 +56,10 @@ cc_test(
SRCS enforce_test.cc
DEPS enforce)
set(CPU_INFO_DEPS gflags glog enforce)
if(WITH_XBYAK)
list(APPEND CPU_INFO_DEPS xbyak)
endif()
cc_library(
cpu_info
SRCS cpu_info.cc
DEPS ${CPU_INFO_DEPS})
cc_test(
cpu_info_test
SRCS cpu_info_test.cc
DEPS cpu_info)
DEPS phi_backends)
cc_library(
os_info
SRCS os_info.cc
......@@ -194,7 +186,6 @@ cc_library(
phi_place
eigen3
cpu_helper
cpu_info
framework_proto
${IPU_CTX_DEPS}
${GPU_CTX_DEPS}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace platform {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();
using phi::backends::cpu::CpuMinChunkSize;
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
using namespace phi::backends::cpu; // NOLINT
// May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace platform
} // namespace paddle
......@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include <sstream>
......@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use);
TEST(CpuMemoryUsage, Print) {
std::stringstream ss;
size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
size_t memory_size =
phi::backends::cpu::CpuMaxAllocSize() / 1024 / 1024 / 1024;
float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......
......@@ -29,7 +29,7 @@ cc_library(
cc_library(
cpu_utilization
SRCS cpu_utilization.cc
DEPS cpu_info os_info enforce glog)
DEPS phi_backends os_info enforce glog)
cc_library(
new_profiler
SRCS profiler.cc
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
......@@ -75,7 +75,6 @@ limitations under the License. */
#include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -94,6 +93,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/pybind/jit.h"
#include "paddle/fluid/pybind/xpu_streams_py.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......@@ -327,7 +327,7 @@ bool SupportsBfloat16() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
return true;
else
return false;
......@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
if (platform::MayIUse(platform::cpu_isa_t::avx512_bf16))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16))
return true;
else
return false;
......@@ -349,8 +349,8 @@ bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
platform::MayIUse(platform::cpu_isa_t::avx512f));
return (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2) ||
phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512f));
#endif
}
......@@ -358,7 +358,8 @@ bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
return phi::backends::cpu::MayIUse(
phi::backends::cpu::cpu_isa_t::avx512_core_vnni);
#endif
}
......@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) {
BindJit(&m);
// Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory();
phi::backends::cpu::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
add_subdirectory(dynload)
add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef __APPLE__
#include <sys/sysctl.h>
......@@ -30,6 +26,10 @@ limitations under the License. */
#include <unistd.h>
#endif // _WIN32
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include <algorithm>
#include "paddle/phi/core/flags.h"
......@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory,
true,
"If set, allocate cpu pinned memory.");
namespace paddle {
namespace platform {
namespace phi {
namespace backends {
namespace cpu {
size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__
......@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() {
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
}
size_t CpuMinChunkSize() {
// Allow to allocate the minimum chunk size is 4 KB.
return 1 << 12;
}
size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
......@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
}
#endif
} // namespace platform
} // namespace paddle
} // namespace cpu
} // namespace backends
} // namespace phi
......@@ -36,15 +36,52 @@
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
namespace phi {
namespace backends {
namespace cpu {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
inline size_t CpuMinChunkSize() {
// Allow to allocate the minimum chunk size is 4 KB.
return 1 << 12;
}
size_t CpuMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum {
isa_any,
......@@ -59,6 +96,8 @@ typedef enum {
avx512_bf16,
} cpu_isa_t; // Instruction set architecture
// May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace cpu
} // namespace backends
} // namespace phi
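// Illustrative call site only (not part of this diff): after the move, ISA
// queries go through the phi namespace, as in the call sites changed above.
// The helper name below is hypothetical.
#include "paddle/phi/backends/cpu/cpu_info.h"
static bool CanUseBf16Fast() {
  namespace cpu = phi::backends::cpu;
  // avx512_core gates BF16 support; avx512_bf16 enables the fast path.
  return cpu::MayIUse(cpu::cpu_isa_t::avx512_core) &&
         cpu::MayIUse(cpu::cpu_isa_t::avx512_bf16);
}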
......@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
math_library(cross_entropy)
math_library(im2col)
math_library(vol2col)
math_library(softmax DEPS math_function)
cc_library(
phi_data_layout_transform
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <stdint.h>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace phi {
......
......@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template class SoftmaxFunctor<phi::CPUContext, float>;
template class SoftmaxFunctor<phi::CPUContext, double>;
template class SoftmaxGradFunctor<phi::CPUContext, float>;
template class SoftmaxGradFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
using DataLayout = phi::backends::gpu::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
template <typename T, typename DeviceContext>
void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
......@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2(
context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace()),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenSoftmaxForward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
context.template Alloc<T>(Y),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else
cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward(
context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace())));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnSoftmaxForward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
context.template Alloc<T>(Y)));
#endif
}
......@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2(
context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace()),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenSoftmaxBackward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
context.template Alloc<T>(XGrad),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims);
......@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace())));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnSoftmaxBackward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
context.template Alloc<T>(XGrad)));
#endif
}
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
#endif
// MIOPEN do not support double
......@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif
template class SoftmaxFunctor<phi::GPUContext, platform::float16>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, float>;
template class SoftmaxFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, float>;
template class SoftmaxGradFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::float16>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::bfloat16>;
template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
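// Note on the allocation change above (illustrative, simplified; these two
// lines summarize the pattern rather than quote the diff): the cuDNN/MIOpen
// wrappers stop using framework Tensor::mutable_data and allocate outputs
// through the phi device context instead.
//   before:  T* out_ptr = Y->mutable_data<T>(context.GetPlace());
//   after:   T* out_ptr = context.template Alloc<T>(Y);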
......@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxFunctor {
......@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor {
#endif
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -15,24 +15,22 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename T,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct ValueClip {
......@@ -104,7 +102,7 @@ class SoftmaxEigen {
};
template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::float16> {
class SoftmaxEigen<DeviceContext, phi::dtype::float16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
......@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
constexpr int kClassDim = 1;
constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::float16>::From(*X);
auto softmax = EigenMatrix<platform::float16>::From(*Y);
auto logits = EigenMatrix<phi::dtype::float16>::From(*X);
auto softmax = EigenMatrix<phi::dtype::float16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
......@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
(logits - logits.maximum(along_axis)
.reshape(batch_by_one)
.broadcast(one_by_class))
.unaryExpr(ValueClip<platform::float16>());
.unaryExpr(ValueClip<phi::dtype::float16>());
} else {
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
......@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
.reshape(batch_one_remain)
.broadcast(one_axis_one)
.reshape(batch_classes))
.unaryExpr(ValueClip<platform::float16>());
.unaryExpr(ValueClip<phi::dtype::float16>());
}
softmax.device(*context.eigen_device()) = softmax.exp();
......@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
};
template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::bfloat16> {
class SoftmaxEigen<DeviceContext, phi::dtype::bfloat16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
......@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
constexpr int kClassDim = 1;
constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::bfloat16>::From(*X);
auto softmax = EigenMatrix<platform::bfloat16>::From(*Y);
auto logits = EigenMatrix<phi::dtype::bfloat16>::From(*X);
auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
......@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
(logits - logits.maximum(along_axis)
.reshape(batch_by_one)
.broadcast(one_by_class))
.unaryExpr(ValueClip<platform::bfloat16>());
.unaryExpr(ValueClip<phi::dtype::bfloat16>());
} else {
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
......@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
.reshape(batch_one_remain)
.broadcast(one_axis_one)
.reshape(batch_classes))
.unaryExpr(ValueClip<platform::bfloat16>());
.unaryExpr(ValueClip<phi::dtype::bfloat16>());
}
softmax.device(*context.eigen_device()) = softmax.exp();
......@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const int batch_size = in_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) {
if (num_remain == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
const T* in_data = X->data<T>();
T* out_data = Y->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, max_val, in_data, out_data);
vec_clip<T, platform::avx>(
vec_add_bias<T, phi::backends::cpu::avx>(
num_classes, max_val, in_data, out_data);
vec_clip<T, phi::backends::cpu::avx>(
num_classes, static_cast<T>(-64), out_data, out_data);
vec_exp<T>(num_classes, out_data, out_data);
T sum = 0;
vec_sum<T, platform::avx>(num_classes, out_data, &sum);
vec_sum<T, phi::backends::cpu::avx>(num_classes, out_data, &sum);
sum = static_cast<T>(1) / sum;
vec_scal<T, platform::avx>(num_classes, sum, out_data, out_data);
vec_scal<T, phi::backends::cpu::avx>(
num_classes, sum, out_data, out_data);
in_data += num_classes;
out_data += num_classes;
......@@ -308,16 +309,16 @@ class SoftmaxGradEigen {
};
template <typename DeviceContext>
class SoftmaxGradEigen<DeviceContext, platform::float16> {
class SoftmaxGradEigen<DeviceContext, phi::dtype::float16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
const phi::DenseTensor* y,
const phi::DenseTensor* y_grad,
phi::DenseTensor* x_grad) {
auto softmax = EigenMatrix<platform::float16>::From(*y);
auto softmax_grad = EigenMatrix<platform::float16>::From(*y_grad);
auto logits_grad = EigenMatrix<platform::float16>::From(*x_grad);
auto softmax = EigenMatrix<phi::dtype::float16>::From(*y);
auto softmax_grad = EigenMatrix<phi::dtype::float16>::From(*y_grad);
auto logits_grad = EigenMatrix<phi::dtype::float16>::From(*x_grad);
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
......@@ -342,16 +343,16 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
};
template <typename DeviceContext>
class SoftmaxGradEigen<DeviceContext, platform::bfloat16> {
class SoftmaxGradEigen<DeviceContext, phi::dtype::bfloat16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
const phi::DenseTensor* y,
const phi::DenseTensor* y_grad,
phi::DenseTensor* x_grad) {
auto softmax = EigenMatrix<platform::bfloat16>::From(*y);
auto softmax_grad = EigenMatrix<platform::bfloat16>::From(*y_grad);
auto logits_grad = EigenMatrix<platform::bfloat16>::From(*x_grad);
auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*y);
auto softmax_grad = EigenMatrix<phi::dtype::bfloat16>::From(*y_grad);
auto logits_grad = EigenMatrix<phi::dtype::bfloat16>::From(*x_grad);
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
......@@ -400,17 +401,20 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const int batch_size = out_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) {
if (num_remain == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
const T* out_data = y->data<T>();
const T* out_grad = y_grad->data<T>();
T* in_grad = x_grad->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T scalar;
vec_mul_reduce<T, platform::avx>(
vec_mul_reduce<T, phi::backends::cpu::avx>(
num_classes, out_grad, out_data, &scalar);
scalar *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
vec_add_bias<T, phi::backends::cpu::avx>(
num_classes, scalar, out_grad, in_grad);
vec_mul<T, phi::backends::cpu::avx>(
num_classes, out_data, in_grad, in_grad);
out_data += num_classes;
out_grad += num_classes;
in_grad += num_classes;
......@@ -422,6 +426,5 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
}
};
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
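// Scalar reference for what the AVX fast path above computes per row
// (illustrative sketch only; the function and variable names below are
// hypothetical and not part of the phi codebase):
#include <algorithm>
#include <cmath>
template <typename T>
void SoftmaxRowReference(int n, const T* in, T* out) {
  // Subtract the row max for numerical stability and clip at -64,
  // matching vec_add_bias + vec_clip / ValueClip in the vectorized code.
  T max_val = *std::max_element(in, in + n);
  T sum = static_cast<T>(0);
  for (int i = 0; i < n; ++i) {
    T shifted = std::max(in[i] - max_val, static_cast<T>(-64));
    out[i] = std::exp(shifted);  // vec_exp
    sum += out[i];               // vec_sum
  }
  // Normalize so the row sums to 1, matching vec_scal with 1 / sum.
  for (int i = 0; i < n; ++i) out[i] /= sum;
}
// Gradient reference matching the SoftmaxGradFunctor fast path above:
//   dx = (dy - sum(dy * y)) * y, computed row by row via vec_mul_reduce,
//   vec_add_bias and vec_mul.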
......@@ -22,7 +22,6 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
......@@ -32,6 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace phi {
......
......@@ -22,7 +22,6 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
......@@ -33,6 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace phi {
......@@ -1386,7 +1386,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
labels_2d.Resize({n, label.numel() / n});
DenseTensor loss_2d(*loss);
loss_2d.Resize({n, 1});
paddle::operators::math::SoftmaxCUDNNFunctor<T, GPUContext>()(
phi::funcs::SoftmaxCUDNNFunctor<T, GPUContext>()(
dev_ctx, &logits_2d, &softmax_2d);
phi::funcs::CrossEntropyFunctor<GPUContext, T>()(dev_ctx,
&loss_2d,
......
......@@ -14,11 +14,11 @@
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace phi {
......@@ -50,7 +50,7 @@ void GumbelSoftmaxGradKernel(const Context& ctx,
dx_2d.Resize({size_to_axis, size_from_axis});
out_2d.Resize({size_to_axis, size_from_axis});
dout_2d.Resize({size_to_axis, size_from_axis});
paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
phi::funcs::SoftmaxGradFunctor<Context, T>()(
ctx, axis_dim, &out_2d, &dout_2d, &dx_2d);
}
......
......@@ -16,12 +16,12 @@
#include <random>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace phi {
......@@ -87,8 +87,7 @@ void GumbelSoftmaxKernelHelper(const Context& ctx,
size_to_axis,
size_from_axis,
temperature);
paddle::operators::math::SoftmaxFunctor<Context, T>()(
ctx, axis_dim, &x_noise_2d, &out_2d);
phi::funcs::SoftmaxFunctor<Context, T>()(ctx, axis_dim, &x_noise_2d, &out_2d);
if (hard) {
OneHotGenerator<Context, T>::Transform(ctx, x, out, axis);
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_grad_kernel.h"
namespace phi {
......@@ -50,7 +50,7 @@ void SoftmaxGradKernel(const Context& dev_ctx,
Out_2d.ShareDataWith(out).Resize({n, d});
dOut_2d.ShareDataWith(out_grad).Resize({n, d});
paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
phi::funcs::SoftmaxGradFunctor<Context, T>()(
dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d);
}
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_kernel.h"
namespace phi {
......@@ -47,8 +47,7 @@ void SoftmaxKernel(const Context& dev_ctx,
DenseTensor X_2d, Out_2d;
X_2d.ShareDataWith(x).Resize({n, d});
Out_2d.ShareDataWith(*out).Resize({n, d});
paddle::operators::math::SoftmaxFunctor<Context, T>()(
dev_ctx, axis_dim, &X_2d, &Out_2d);
phi::funcs::SoftmaxFunctor<Context, T>()(dev_ctx, axis_dim, &X_2d, &Out_2d);
}
} // namespace phi
......@@ -22,7 +22,7 @@ endif()
cc_test(
test_cpu_vec
SRCS test_cpu_vec.cc
DEPS blas cpu_info)
DEPS blas phi_backends)
# For String Kernels
cc_test(
......