未验证 提交 09096aeb 编写于 作者: L Leo Chen 提交者: GitHub

unify cpu context (#43989)

* unify cpu context

* fix init()

* delete test_device_context

* fix test_scalar
上级 8d9f00a8
......@@ -20,12 +20,6 @@ namespace paddle {
namespace framework {
class OpDesc;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -20,9 +20,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -34,9 +34,6 @@ namespace operators {
template <typename DeviceContext, typename T, typename Functor>
class OverflowKernel;
} // namespace operators
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace plat = paddle::platform;
......
......@@ -24,9 +24,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -13,26 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class BeamSearchFunctor<platform::CPUDeviceContext, T> {
class BeamSearchFunctor<phi::CPUContext, T> {
public:
void operator()(const platform::CPUDeviceContext &context,
void operator()(const phi::CPUContext &context,
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores,
const framework::LoDTensor *ids,
......@@ -308,10 +301,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
}
};
template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
template class BeamSearchFunctor<phi::CPUContext, int>;
template class BeamSearchFunctor<phi::CPUContext, int64_t>;
template class BeamSearchFunctor<phi::CPUContext, float>;
template class BeamSearchFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -28,13 +29,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -13,19 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/context_project.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace operators {
namespace math {
template class ContextProjectFunctor<platform::CPUDeviceContext, float>;
template class ContextProjectFunctor<platform::CPUDeviceContext, double>;
template class ContextProjectFunctor<phi::CPUContext, float>;
template class ContextProjectFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
......
......@@ -14,16 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cos_sim_functor.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
......
......@@ -17,12 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -129,9 +123,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template class CrossEntropyFunctor<platform::CPUDeviceContext, float>;
template class CrossEntropyFunctor<platform::CPUDeviceContext, double>;
template class CrossEntropyFunctor<phi::CPUContext, float>;
template class CrossEntropyFunctor<phi::CPUContext, double>;
} // namespace math
......
......@@ -15,12 +15,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -16,12 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace phi {
class CPUContext;
} // namespace phi
......@@ -166,24 +160,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
float>;
......@@ -353,24 +335,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
float>;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
namespace math {
// Shorthand alias for platform::float16 inside this namespace.
using float16 = paddle::platform::float16;
// Explicit instantiations of SetConstant for platform::CPUDeviceContext, one
// per element type listed below.
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int16_t>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<double>>;
// The same element types, instantiated for phi::CPUContext.
template struct SetConstant<phi::CPUContext, platform::float16>;
template struct SetConstant<phi::CPUContext, platform::bfloat16>;
template struct SetConstant<phi::CPUContext, float>;
template struct SetConstant<phi::CPUContext, double>;
template struct SetConstant<phi::CPUContext, int16_t>;
template struct SetConstant<phi::CPUContext, int>;
template struct SetConstant<phi::CPUContext, int64_t>;
template struct SetConstant<phi::CPUContext, bool>;
template struct SetConstant<phi::CPUContext, uint8_t>;
template struct SetConstant<phi::CPUContext, platform::complex<float>>;
template struct SetConstant<phi::CPUContext, platform::complex<double>>;
// Only when PADDLE_WITH_XPU is defined: explicit instantiations of SetConstant
// for platform::XPUDeviceContext, one per element type listed below.
#ifdef PADDLE_WITH_XPU
template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::XPUDeviceContext, float>;
template struct SetConstant<platform::XPUDeviceContext, double>;
template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<platform::XPUDeviceContext, int16_t>;
template struct SetConstant<platform::XPUDeviceContext, int>;
template struct SetConstant<platform::XPUDeviceContext, int64_t>;
template struct SetConstant<platform::XPUDeviceContext, bool>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<double>>;
#endif
// DEFINE_CPU_TRANS(RANK) explicitly instantiates
// Transpose<platform::CPUDeviceContext, T, RANK> for each element type T
// listed in the macro body. It is expanded below for ranks 1 through 6.
#define DEFINE_CPU_TRANS(RANK) \
 template struct Transpose<platform::CPUDeviceContext, \
 platform::float16, \
 RANK>; \
 template struct Transpose<platform::CPUDeviceContext, \
 platform::bfloat16, \
 RANK>; \
 template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>; \
 template struct Transpose<platform::CPUDeviceContext, \
 platform::complex<float>, \
 RANK>; \
 template struct Transpose<platform::CPUDeviceContext, \
 platform::complex<double>, \
 RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
// CPU specialization of TransposeNormal: a rank-agnostic transpose that
// permutes the dimensions of `in` according to `axis` and stores the result
// in `out`.
//
// For every linear offset in the output, the offset is decomposed into
// per-dimension coordinates using the output strides; coordinate i is then
// weighted by the input stride of dimension axis[i] to locate the source
// element. `out` must already have its dims set and its buffer allocated
// (it is read through data<T>(), not mutable_data<T>()).
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& in,
                  framework::Tensor* out,
                  const std::vector<int>& axis) {
    const int ndim = axis.size();
    auto src_strides = phi::stride(in.dims());
    auto dst_strides = phi::stride(out->dims());
    const T* src = in.data<T>();
    T* dst = out->data<T>();

    const int64_t total = out->numel();
    for (int64_t dst_offset = 0; dst_offset < total; ++dst_offset) {
      // Peel off one coordinate per dimension, outermost first, and
      // accumulate the matching source offset.
      int64_t src_offset = 0;
      int64_t remainder = dst_offset;
      for (int d = 0; d < ndim; ++d) {
        const int64_t coord = remainder / dst_strides[d];
        remainder -= coord * dst_strides[d];
        src_offset += coord * src_strides[axis[d]];
      }
      dst[dst_offset] = src[src_offset];
    }
  }
};
// DEFINE_CPU_TRANS_NORMAL(TYPE) explicitly instantiates
// TransposeNormal<platform::CPUDeviceContext, TYPE>; it is expanded below once
// per element type.
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
 template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
// Data-type visitor that fills a tensor with a constant on the host.
//
// apply<T>() obtains the tensor's buffer as T on CPUPlace via mutable_data
// and overwrites all numel() elements with `value_` converted to T.
struct TensorSetConstantCPU {
  TensorSetConstantCPU(framework::Tensor* tensor, float value)
      : tensor_(tensor), value_(value) {}

  template <typename T>
  void apply() const {
    T* const first = tensor_->mutable_data<T>(platform::CPUPlace());
    T* const last = first + tensor_->numel();
    std::fill(first, last, static_cast<T>(value_));
  }

  framework::Tensor* tensor_;  // tensor to fill (not owned)
  float value_;                // fill value, converted to T in apply<T>()
};
// XPUPlace specialization: not supported through this path; always raises
// an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
// NPUPlace specialization: not supported through this path; always raises
// an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
// NPUPinnedPlace specialization: not supported through this path; always
// raises an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
// IPUPlace specialization: not supported through this path; always raises
// an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
// CPUPlace specialization: fills `tensor` with `value` on the host.
// The tensor's runtime data type selects which TensorSetConstantCPU::apply<T>
// is executed. `context` is not used.
template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context,
    framework::Tensor* tensor,
    float value) {
  TensorSetConstantCPU host_filler(tensor, value);
  framework::VisitDataType(tensor->type(), host_filler);
}
// MLUPlace specialization: not supported through this path; always raises
// an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::MLUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
// CustomPlace specialization: not supported through this path; always raises
// an Unimplemented error. None of the arguments are used.
template <>
void set_constant_with_place<platform::CustomPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported"));
}
// CUDAPinnedPlace specialization: handled with the same host-side visitor as
// CPUPlace. The tensor's runtime data type selects which
// TensorSetConstantCPU::apply<T> is executed. `context` is not used.
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
    const platform::DeviceContext& context,
    framework::Tensor* tensor,
    float value) {
  TensorSetConstantCPU host_filler(tensor, value);
  framework::VisitDataType(tensor->type(), host_filler);
}
// Place visitor that forwards to the set_constant_with_place<Place>
// specialization matching the static type of the visited place.
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
// Captures the arguments to forward; `context` is held by reference and
// `tensor` by raw pointer, so both must outlive this visitor.
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
// Only the static type of `place` is used for dispatch; its value is ignored.
template <typename Place>
void operator()(Place place) const {
set_constant_with_place<Place>(context_, tensor_, value_);
}
const platform::DeviceContext& context_;
framework::Tensor* tensor_;
float value_;
};
// Sets `tensor` to the constant `value`, dispatching on a place type.
//
// When PADDLE_WITH_CUDA or PADDLE_WITH_HIP is defined, the tensor's own place
// is visited and the matching set_constant_with_place specialization runs.
// In every other build the CPUPlace specialization is invoked unconditionally,
// regardless of tensor->place().
void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Dispatch goes through platform::VisitPlace; the former
// tensor->place().apply_visitor(func) call is no longer used.
paddle::platform::VisitPlace(tensor->place(), func);
#else
func(platform::CPUPlace());
#endif
}
// CPU specialization of RowwiseAdd: adds `vector` element-wise to every row
// of `input` and writes the sums into `output`.
//
// Preconditions, enforced with PADDLE_ENFORCE_EQ (InvalidArgument on failure):
//   - vector.numel() equals the number of elements in one row of `input`,
//     i.e. input.numel() / input.dims()[0];
//   - output->dims() equals input.dims().
// `context` is not used.
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& vector,
                  framework::Tensor* output) {
    auto in_dims = input.dims();
    auto out_dims = output->dims();
    auto size = input.numel() / in_dims[0];
    PADDLE_ENFORCE_EQ(
        vector.numel(),
        size,
        platform::errors::InvalidArgument(
            "The input vector size"
            " should be equal to the size of each row of input tensor."
            " Expected vector size=%d, but received %d",
            size,
            vector.numel()));

    // Keep the formatted shapes alive in named locals. Taking c_str() directly
    // on the temporary returned by to_str() would leave a dangling pointer
    // once that statement ends, and the error message below would then read
    // freed memory.
    const auto in_dims_str = in_dims.to_str();
    const auto out_dims_str = out_dims.to_str();
    PADDLE_ENFORCE_EQ(out_dims,
                      in_dims,
                      platform::errors::InvalidArgument(
                          "The output tensor shape should be same as the input"
                          " tensor shape. Expected output tensor shape: %s,"
                          " but received %s",
                          in_dims_str.c_str(),
                          out_dims_str.c_str()));

    auto in = framework::EigenMatrix<T>::From(input);
    auto vec = framework::EigenVector<T>::Flatten(vector);
    auto out = framework::EigenMatrix<T>::From(*output);

    // chip(i, 0) selects row i; add the vector to each row in turn.
    for (int64_t i = 0; i < in_dims[0]; ++i) {
      out.chip(i, 0) = in.chip(i, 0) + vec;
    }
  }
};
// Explicit instantiations of the row-wise / column-wise functors for
// platform::CPUDeviceContext.
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
// CPU specialization of ElementwiseAddTo: accumulates `src` into `*dst`
// element by element (dst = dst + src). Both tensors are viewed as flat
// vectors, and the expression is evaluated on the Eigen device owned by `ctx`.
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
  void operator()(platform::CPUDeviceContext* ctx,
                  const framework::Tensor& src,
                  framework::Tensor* dst) {
    auto addend = framework::EigenVector<T>::Flatten(src);
    auto accum = framework::EigenVector<T>::Flatten(*dst);
    accum.device(*(ctx->eigen_device())) = accum + addend;
  }
};

// Only the float16 variant is explicitly instantiated here.
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -109,11 +109,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
}
}
template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
template class MaxOutFunctor<platform::CPUDeviceContext, float>;
template class MaxOutFunctor<platform::CPUDeviceContext, double>;
template class MaxOutGradFunctor<phi::CPUContext, float>;
template class MaxOutGradFunctor<phi::CPUContext, double>;
template class MaxOutFunctor<phi::CPUContext, float>;
......
......@@ -14,19 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template class SampleWithProb<platform::CPUDeviceContext, float>;
template class SampleWithProb<platform::CPUDeviceContext, double>;
} // namespace math
namespace math {} // namespace math
} // namespace operators
} // namespace paddle
......@@ -276,51 +276,6 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
template struct SelectedRowsSumTo<platform::CPUDeviceContext, float>;
template struct SelectedRowsSumTo<platform::CPUDeviceContext, double>;
// CPUDeviceContext specialization: adds the rows stored in a SelectedRows
// onto a dense tensor, in place.
//
// For each i, row i of input1.value() is added element-wise onto row
// input1.rows()[i] of *input2. If input1 has no rows, a warning is logged and
// *input2 is left untouched. The height and the per-row element count of the
// two operands must match; otherwise InvalidArgument is raised.
// `context` is not used.
template <typename T>
struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const phi::SelectedRows& input1,
                  framework::Tensor* input2) {
    if (UNLIKELY(input1.rows().size() == 0)) {
      LOG(WARNING) << "input selected rows is empty!";
      return;
    }

    auto src_height = input1.height();
    const auto& dst_dims = input2->dims();
    PADDLE_ENFORCE_EQ(
        src_height,
        dst_dims[0],
        platform::errors::InvalidArgument("The two inputs height must be equal."
                                          "But received first input height = "
                                          "[%d], second input height = [%d]",
                                          src_height,
                                          dst_dims[0]));

    auto& src_value = input1.value();
    auto& src_rows = input1.rows();
    int64_t row_width = src_value.numel() / src_rows.size();
    PADDLE_ENFORCE_EQ(
        row_width,
        input2->numel() / src_height,
        platform::errors::InvalidArgument(
            "The two inputs width must be equal."
            "But received first input width = [%d], second input width = [%d]",
            row_width,
            input2->numel() / src_height));

    const T* src = src_value.data<T>();
    T* dst = input2->data<T>();

    // Walk the sparse rows; each one lands on the dense row named by its id.
    const size_t row_count = src_rows.size();
    for (size_t r = 0; r < row_count; ++r) {
      const T* src_row = src + r * row_width;
      T* dst_row = dst + src_rows[r] * row_width;
      for (int64_t c = 0; c < row_width; ++c) {
        dst_row[c] += src_row[c];
      }
    }
  }
};
template <typename T>
struct SelectedRowsAddToTensor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& context,
......@@ -366,13 +321,6 @@ struct SelectedRowsAddToTensor<phi::CPUContext, T> {
}
};
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
platform::bfloat16>;
template struct SelectedRowsAddToTensor<phi::CPUContext, float>;
template struct SelectedRowsAddToTensor<phi::CPUContext, double>;
template struct SelectedRowsAddToTensor<phi::CPUContext, int>;
......@@ -582,34 +530,6 @@ struct MergeAddImpl {
}
};
// CPUDeviceContext specialization of MergeAdd. Every overload simply forwards
// its arguments to the matching overload of
// MergeAddImpl<platform::CPUDeviceContext, T>.
template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
// Returns the merged result by value.
phi::SelectedRows operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CPUDeviceContext, T>()(
context, input, sorted_result);
}
// Same single-input merge, but the result is written into `output`.
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(
context, input, output, sorted_result);
}
// Multi-input variant: forwards a list of SelectedRows plus `output`.
void operator()(const platform::CPUDeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(
context, inputs, output, sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::CPUContext, T> {
// unary functor, merge by adding duplicated rows in
......@@ -636,9 +556,7 @@ struct MergeAdd<phi::CPUContext, T> {
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \
template struct MergeAddImpl<platform::CPUDeviceContext, dtype>; \
template struct MergeAddImpl<phi::CPUContext, dtype>; \
template struct MergeAdd<platform::CPUDeviceContext, dtype>; \
template struct MergeAdd<phi::CPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float)
......
......@@ -20,13 +20,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -101,66 +94,6 @@ static void fast_mem_init(void* dest,
}
}
// CPUDeviceContext specialization of PaddingLoDTensorFunctor.
//
// First fills the whole of `pad_tensor` with the padding value, then copies
// the sequence data from `seq_tensor` into it via CopyValidData(kSeqToPad).
//
// Parameters:
//   seq_tensor    - source LoD tensor holding the sequences.
//   pad_tensor    - destination; its buffer is read through data<T>(), so it
//                   must already be allocated.
//   pad_value     - either a single element or one element per step position
//                   (numel == step_width); anything else raises
//                   InvalidArgument.
//   pad_seq_len   - padded length; -1 means "use the longest sequence".
//   lod_level     - which LoD level (after ToAbsOffset) delimits sequences.
//   norm_by_times - forwarded unchanged to CopyValidData.
//   layout        - forwarded unchanged to CheckDims and CopyValidData.
// `context` is not used.
template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
// -1 is the sentinel for "derive the padded length from the data".
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
// Number of elements per first-dimension entry of seq_tensor.
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE_EQ(
pad_value.numel() == 1 || pad_value.numel() == step_width,
true,
platform::errors::InvalidArgument(
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width', but got %ld != 1 and %ld. Please check the input "
"value.",
pad_value.numel(),
step_width));
// fill padding value
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) {
// Scalar padding value: replicate it across the whole buffer.
fast_mem_init<T>(
pad_data, pad_tensor->numel(), pad_value_data, sizeof(T));
} else {
// Per-position padding value: copy it once for every step_width-sized
// chunk of the buffer.
for (int i = 0; i < pad_tensor->numel(); i += step_width) {
memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
}
}
// Overwrite the padded buffer with the sequence data.
CopyValidData<T>(pad_tensor,
&seq_tensor,
seq_offsets,
pad_seq_len,
step_width,
norm_by_times,
kSeqToPad,
layout);
}
};
template <typename T>
class PaddingLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -221,42 +154,6 @@ class PaddingLoDTensorFunctor<phi::CPUContext, T> {
}
};
// CPUDeviceContext specialization of UnpaddingLoDTensorFunctor: the inverse of
// padding. Copies the sequence data out of `pad_tensor` back into
// `seq_tensor` via CopyValidData(kPadToSeq), using the LoD already set on
// `seq_tensor` to delimit the sequences.
//
// pad_seq_len == -1 means "use the longest sequence in the LoD".
// `norm_by_times` and `layout` are forwarded unchanged. `context` is not used.
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& pad_tensor,
                  framework::LoDTensor* seq_tensor,
                  int pad_seq_len = -1,
                  int lod_level = 0,
                  bool norm_by_times = false,
                  const PadLayout layout = kBatchLengthWidth) {
    auto offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
    const auto& seq_dims = seq_tensor->dims();
    const auto& pad_dims = pad_tensor.dims();

    // Resolve the sentinel before any shape validation.
    if (pad_seq_len == -1) {
      pad_seq_len = MaximumSequenceLength(offsets);
    }

    // Number of elements per first-dimension entry of the sequence tensor.
    int width = seq_tensor->numel() / seq_dims[0];

    CheckDims(seq_dims, pad_dims, offsets, pad_seq_len, width, layout);

    CopyValidData<T>(seq_tensor,
                     &pad_tensor,
                     offsets,
                     pad_seq_len,
                     width,
                     norm_by_times,
                     kPadToSeq,
                     layout);
  }
};
template <typename T>
class UnpaddingLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -293,16 +190,6 @@ class UnpaddingLoDTensorFunctor<phi::CPUContext, T> {
}
};
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class PaddingLoDTensorFunctor<phi::CPUContext, int>;
template class PaddingLoDTensorFunctor<phi::CPUContext, int64_t>;
template class PaddingLoDTensorFunctor<phi::CPUContext, float>;
......
......@@ -24,29 +24,6 @@ namespace paddle {
namespace operators {
namespace math {
// CPUDeviceContext specialization of ScaleLoDTensorFunctor: multiplies, in
// place, every element belonging to sequence i of `seq` by scales[i].
// Sequence boundaries come from level 0 of the tensor's LoD, and each offset
// is scaled by the tensor's second dimension to obtain element indices.
template <typename T>
class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const T* scales,
framework::LoDTensor* seq) {
const size_t level = 0;
auto lod = seq->lod();
// NOTE(review): size() - 1 wraps around if lod[level] is empty; confirm
// that callers always provide a non-empty level-0 LoD.
const size_t num_seq = lod[level].size() - 1;
size_t seq_width = seq->dims()[1];
// NOTE(review): abs_offset_lod is computed but never read; the loop below
// indexes with the raw lod[level] offsets. Confirm whether absolute offsets
// were intended for multi-level LoD inputs.
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < num_seq; ++i) {
// Element range [begin * width, end * width) of sequence i.
for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width;
++j) {
seq_data[j] *= scales[i];
}
}
}
};
template <typename T>
class ScaleLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -70,9 +47,6 @@ class ScaleLoDTensorFunctor<phi::CPUContext, T> {
}
};
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class ScaleLoDTensorFunctor<phi::CPUContext, float>;
template class ScaleLoDTensorFunctor<phi::CPUContext, double>;
......
......@@ -21,13 +21,6 @@ namespace paddle {
namespace operators {
namespace math {
template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
template class SoftmaxFunctor<phi::CPUContext, float, true>;
template class SoftmaxFunctor<phi::CPUContext, float, false>;
template class SoftmaxFunctor<phi::CPUContext, double, true>;
......
......@@ -16,12 +16,6 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -32,126 +26,6 @@ namespace math {
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
/*
 * Vol2ColFunctor specialization for platform::CPUDeviceContext.
 *
 * Copies elements of the 4-D tensor `vol` into the 7-D tensor `col`.
 * Positions that fall into the padding area are written as zero.
 *
 * context:     unused by this implementation.
 * vol:         4-D input; dims are read as [C, D, H, W], or as [D, H, W, C]
 *              when data_layout == DataLayout::kNHWC.
 * dilations:   at least 3 entries, indexed [depth, height, width].
 * strides:     at least 3 entries, indexed [depth, height, width].
 * paddings:    exactly 6 entries
 *              [d_front, d_back, h_up, h_down, w_left, w_right], otherwise the
 *              first 3 entries [d, h, w] are applied to both sides.
 * col:         7-D output; dims[1..3] are the filter depth/height/width and
 *              dims[4..6] the output depth/height/width.
 * data_layout: selects how `vol` is indexed (see above).
 */
template <class T>
class Vol2ColFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& vol,
                  const std::vector<int>& dilations,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* col,
                  const DataLayout data_layout) const {
    PADDLE_ENFORCE_EQ(vol.dims().size(),
                      4,
                      platform::errors::InvalidArgument(
                          "The dimension of vol should be 4, but received %d.",
                          vol.dims().size()));
    PADDLE_ENFORCE_EQ(col->dims().size(),
                      7,
                      platform::errors::InvalidArgument(
                          "The dimension of col should be 7, but received %d.",
                          col->dims().size()));
    // The code below indexes dilations[0..2], strides[0..2] and at least
    // paddings[0..2]; reject shorter vectors instead of reading out of bounds.
    PADDLE_ENFORCE_EQ(
        dilations.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of dilations should be at least 3, but received %d.",
            dilations.size()));
    PADDLE_ENFORCE_EQ(
        strides.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of strides should be at least 3, but received %d.",
            strides.size()));
    PADDLE_ENFORCE_EQ(
        paddings.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of paddings should be 6 or at least 3, but received %d.",
            paddings.size()));

    const bool channel_first = (data_layout != DataLayout::kNHWC);
    int input_channels = channel_first ? vol.dims()[0] : vol.dims()[3];
    int input_depth = channel_first ? vol.dims()[1] : vol.dims()[0];
    int input_height = channel_first ? vol.dims()[2] : vol.dims()[1];
    int input_width = channel_first ? vol.dims()[3] : vol.dims()[2];
    int filter_depth = col->dims()[1];
    int filter_height = col->dims()[2];
    int filter_width = col->dims()[3];
    int output_depth = col->dims()[4];
    int output_height = col->dims()[5];
    int output_width = col->dims()[6];
    int channels_col =
        input_channels * filter_depth * filter_height * filter_width;

    // Six entries give each side its own padding; otherwise the per-axis
    // value is used for both sides.
    bool paddings_size_is_6 = (paddings.size() == 6);
    int pad_d_forth = paddings[0];
    int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
    int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
    int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
    int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
    int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];

    // The output extents implied by input size, padding, dilation and stride
    // must match the extents stored in `col`.
    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
                            ((dilations[0] * (filter_depth - 1) + 1))) /
                               strides[0] +
                           1;
    PADDLE_ENFORCE_EQ(
        input_depth_tmp,
        output_depth,
        platform::errors::InvalidArgument(
            "input_depth(%d) and output_depth(%d) are mismatching.",
            input_depth_tmp,
            output_depth));
    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
                             ((dilations[1] * (filter_height - 1) + 1))) /
                                strides[1] +
                            1;
    PADDLE_ENFORCE_EQ(
        input_height_tmp,
        output_height,
        platform::errors::InvalidArgument(
            "input_height(%d) and output_height(%d) are mismatching.",
            input_height_tmp,
            output_height));
    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
                            ((dilations[2] * (filter_width - 1) + 1))) /
                               strides[2] +
                           1;
    PADDLE_ENFORCE_EQ(
        input_width_tmp,
        output_width,
        platform::errors::InvalidArgument(
            "input_width(%d) and output_width(%d) are mismatching.",
            input_width_tmp,
            output_width));

    const T* vol_data = vol.data<T>();
    T* col_data = col->data<T>();

    for (int c = 0; c < channels_col; ++c) {
      // Decompose the flat column channel into filter offsets and the input
      // channel it belongs to.
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
      int d_offset = (c / filter_width / filter_height) % filter_depth;
      int c_in = c / filter_width / filter_height / filter_depth;
      for (int d = 0; d < output_depth; ++d) {
        int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
        for (int h = 0; h < output_height; ++h) {
          int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
          for (int w = 0; w < output_width; ++w) {
            int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];

            int col_idx =
                ((c * output_depth + d) * output_height + h) * output_width + w;
            int vol_idx;
            if (channel_first) {
              vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
                            input_width +
                        w_pad;
            } else {
              vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
                            input_channels +
                        c_in;
            }
            // Out-of-range source coordinates lie in the padding: emit zero
            // and never dereference vol_data with that index.
            col_data[col_idx] =
                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
                    ? static_cast<T>(0)
                    : vol_data[vol_idx];
          }
        }
      }
    }
  }
};
template <class T>
class Vol2ColFunctor<phi::CPUContext, T> {
public:
......@@ -278,126 +152,6 @@ class Vol2ColFunctor<phi::CPUContext, T> {
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
/*
 * Col2VolFunctor specialization for platform::CPUDeviceContext.
 *
 * Adds (`+=`) every element of the 7-D tensor `col` into the element of the
 * 4-D tensor `vol` it maps to; column entries that map into the padding area
 * are skipped. `vol` is not cleared by this functor.
 *
 * context:     unused by this implementation.
 * col:         7-D input; dims[1..3] are the filter depth/height/width and
 *              dims[4..6] the output depth/height/width.
 * dilations:   at least 3 entries, indexed [depth, height, width].
 * strides:     at least 3 entries, indexed [depth, height, width].
 * paddings:    exactly 6 entries
 *              [d_front, d_back, h_up, h_down, w_left, w_right], otherwise the
 *              first 3 entries [d, h, w] are applied to both sides.
 * vol:         4-D output; dims are read as [C, D, H, W], or as [D, H, W, C]
 *              when data_layout == DataLayout::kNHWC.
 * data_layout: selects how `vol` is indexed (see above).
 */
template <class T>
class Col2VolFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& col,
                  const std::vector<int>& dilations,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  framework::Tensor* vol,
                  const DataLayout data_layout) const {
    PADDLE_ENFORCE_EQ(vol->dims().size(),
                      4,
                      platform::errors::InvalidArgument(
                          "The dimension of vol should be 4, but received %d.",
                          vol->dims().size()));
    PADDLE_ENFORCE_EQ(col.dims().size(),
                      7,
                      platform::errors::InvalidArgument(
                          "The dimension of col should be 7, but received %d.",
                          col.dims().size()));
    // The code below indexes dilations[0..2], strides[0..2] and at least
    // paddings[0..2]; reject shorter vectors instead of reading out of bounds.
    PADDLE_ENFORCE_EQ(
        dilations.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of dilations should be at least 3, but received %d.",
            dilations.size()));
    PADDLE_ENFORCE_EQ(
        strides.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of strides should be at least 3, but received %d.",
            strides.size()));
    PADDLE_ENFORCE_EQ(
        paddings.size() >= 3,
        true,
        platform::errors::InvalidArgument(
            "The size of paddings should be 6 or at least 3, but received %d.",
            paddings.size()));

    const bool channel_first = (data_layout != DataLayout::kNHWC);
    int input_channels = channel_first ? vol->dims()[0] : vol->dims()[3];
    int input_depth = channel_first ? vol->dims()[1] : vol->dims()[0];
    int input_height = channel_first ? vol->dims()[2] : vol->dims()[1];
    int input_width = channel_first ? vol->dims()[3] : vol->dims()[2];
    int filter_depth = col.dims()[1];
    int filter_height = col.dims()[2];
    int filter_width = col.dims()[3];
    int output_depth = col.dims()[4];
    int output_height = col.dims()[5];
    int output_width = col.dims()[6];
    int channels_col =
        input_channels * filter_depth * filter_height * filter_width;

    // Six entries give each side its own padding; otherwise the per-axis
    // value is used for both sides.
    bool paddings_size_is_6 = (paddings.size() == 6);
    int pad_d_forth = paddings[0];
    int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
    int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
    int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
    int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
    int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];

    // The output extents implied by input size, padding, dilation and stride
    // must match the extents stored in `col`.
    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
                            ((dilations[0] * (filter_depth - 1) + 1))) /
                               strides[0] +
                           1;
    PADDLE_ENFORCE_EQ(
        input_depth_tmp,
        output_depth,
        platform::errors::InvalidArgument(
            "input_depth(%d) and output_depth(%d) are mismatching.",
            input_depth_tmp,
            output_depth));
    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
                             ((dilations[1] * (filter_height - 1) + 1))) /
                                strides[1] +
                            1;
    PADDLE_ENFORCE_EQ(
        input_height_tmp,
        output_height,
        platform::errors::InvalidArgument(
            "input_height(%d) and output_height(%d) are mismatching.",
            input_height_tmp,
            output_height));
    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
                            ((dilations[2] * (filter_width - 1) + 1))) /
                               strides[2] +
                           1;
    PADDLE_ENFORCE_EQ(
        input_width_tmp,
        output_width,
        platform::errors::InvalidArgument(
            "input_width(%d) and output_width(%d) are mismatching.",
            input_width_tmp,
            output_width));

    T* vol_data = vol->data<T>();
    const T* col_data = col.data<T>();

    for (int c = 0; c < channels_col; ++c) {
      // Decompose the flat column channel into filter offsets and the volume
      // channel it belongs to.
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
      int d_offset = (c / filter_width / filter_height) % filter_depth;
      int cIm = c / filter_width / filter_height / filter_depth;
      for (int d = 0; d < output_depth; ++d) {
        int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
        for (int h = 0; h < output_height; ++h) {
          int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
          for (int w = 0; w < output_width; ++w) {
            int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];

            // Only coordinates inside the volume receive a contribution;
            // anything in the padding is dropped.
            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
              int vol_idx;
              if (channel_first) {
                vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
                              input_width +
                          w_pad;
              } else {
                vol_idx =
                    ((d_pad * input_height + h_pad) * input_width + w_pad) *
                        input_channels +
                    cIm;
              }
              int col_idx =
                  ((c * output_depth + d) * output_height + h) * output_width +
                  w;
              vol_data[vol_idx] += col_data[col_idx];
            }
          }
        }
      }
    }
  }
};
template <class T>
class Col2VolFunctor<phi::CPUContext, T> {
public:
......@@ -518,13 +272,9 @@ class Col2VolFunctor<phi::CPUContext, T> {
}
};
// Explicit instantiations of Vol2ColFunctor for float and double, for both
// platform::CPUDeviceContext and phi::CPUContext.
template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
template class Vol2ColFunctor<phi::CPUContext, float>;
template class Vol2ColFunctor<phi::CPUContext, double>;
// Explicit instantiations of Col2VolFunctor for the same context/type pairs.
template class Col2VolFunctor<platform::CPUDeviceContext, float>;
template class Col2VolFunctor<platform::CPUDeviceContext, double>;
template class Col2VolFunctor<phi::CPUContext, float>;
template class Col2VolFunctor<phi::CPUContext, double>;
......
......@@ -34,7 +34,6 @@ class DenseTensor;
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
class MKLDNNDeviceContext;
} // namespace platform
} // namespace paddle
......
......@@ -24,9 +24,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -27,9 +27,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -27,9 +27,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(reduce_all,
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(reduce_any,
......
......@@ -25,9 +25,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
......
......@@ -27,9 +27,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -31,9 +31,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -367,14 +367,6 @@ DeviceContextPool::DeviceContextPool(
/*disable_setting_default_stream_for_allocator=*/false);
}
// Default constructor: default-constructs the phi::CPUContext base and then
// calls phi::CPUContext::Init() on it.
CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() {
phi::CPUContext::Init();
}
// Place constructor: constructs the phi::CPUContext base from `place` and then
// calls phi::CPUContext::Init() on it.
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) {
phi::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
......
......@@ -134,14 +134,7 @@ constexpr DeviceType kMLU = DeviceType::MLU;
using DeviceContext = phi::DeviceContext;
// using CPUDeviceContext = phi::CPUContext;
// TODO(wilber): The place constructor is used in many places, it is more
// difficult to use CPUDeviceContext = phi::CPUContext directly.
// CPU device context that publicly derives from phi::CPUContext. It declares
// only two constructors: a default one and an explicit one taking a CPUPlace.
class CPUDeviceContext : public phi::CPUContext {
public:
// Default constructor (defined out of line).
CPUDeviceContext();
// Constructs the context for the given CPU place; explicit, so a CPUPlace is
// never implicitly converted into a context.
explicit CPUDeviceContext(CPUPlace place);
};
using CPUDeviceContext = phi::CPUContext;
template <typename Place>
struct DefaultDeviceContextType;
......
......@@ -69,30 +69,6 @@ struct Transform {
};
// NOTE: After the phi kernel is migrated, it needs to be deleted.
// Transform specialization for platform::CPUDeviceContext: both overloads
// forward directly to std::transform and do not use `context`.
template <>
struct Transform<platform::CPUDeviceContext> {
// Unary form: writes op(*it) for every it in [first, last) to `result`.
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const platform::CPUDeviceContext& context,
InputIter first,
InputIter last,
OutputIter result,
UnaryOperation op) {
std::transform(first, last, result, op);
}
// Binary form: combines [first1, last1) with the range starting at `first2`
// element by element through `op` and writes the results to `result`.
template <typename InputIter1,
typename InputIter2,
typename OutputIter,
typename BinaryOperation>
void operator()(const platform::CPUDeviceContext& context,
InputIter1 first1,
InputIter1 last1,
InputIter2 first2,
OutputIter result,
BinaryOperation op) {
std::transform(first1, last1, first2, result, op);
}
};
template <>
struct Transform<phi::CPUContext> {
......
......@@ -20,7 +20,6 @@ namespace phi {
::phi::CPUContext CreateCPUContext() {
::phi::CPUContext ctx{};
ctx.Init();
auto allocator = new backends::CpuPhiAllocator{};
ctx.SetAllocator(allocator);
ctx.SetHostAllocator(allocator);
......
......@@ -81,7 +81,6 @@ TEST(ElementwiseAdd, launcher_registry) {
::phi::CPUContext context;
context.SetAllocator(alloc);
context.Init();
host_context::KernelFrameBuilder kernel_frame_builder;
kernel_frame_builder.AddArgument(new host_context::Value(std::move(context)));
......
......@@ -51,10 +51,14 @@ struct CPUContext::Impl {
};
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {
impl_->Init();
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {
impl_->Init();
}
CPUContext::~CPUContext() = default;
......@@ -62,8 +66,6 @@ CPUContext::CPUContext(CPUContext&&) = default;
CPUContext& CPUContext::operator=(CPUContext&&) = default;
// Forwards to Init() on the implementation object held in impl_.
void CPUContext::Init() { impl_->Init(); }
// Returns the Eigen::DefaultDevice pointer provided by the implementation
// object (impl_->GetEigenDevice()).
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return impl_->GetEigenDevice();
}
......
......@@ -34,12 +34,6 @@ class PADDLE_API CPUContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const;
const Place& GetPlace() const override;
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
......
......@@ -1003,12 +1003,6 @@ struct CBlas<phi::dtype::float16> {
#ifdef PADDLE_WITH_MKLML
// platform::CPUDeviceContext specialization: thin forwarder that returns the
// result of CBlas<T>::GEMM_ALLOC called with the same arguments.
template <>
template <typename T>
T *Blas<paddle::platform::CPUDeviceContext>::GEMM_ALLOC(
const CBLAS_IDENTIFIER id, const int M, const int N, const int K) const {
return CBlas<T>::GEMM_ALLOC(id, M, N, K);
}
template <>
template <typename T>
T *Blas<phi::CPUContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
const int M,
const int N,
......@@ -1016,20 +1010,6 @@ T *Blas<phi::CPUContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
return CBlas<T>::GEMM_ALLOC(id, M, N, K);
}
// platform::CPUDeviceContext specialization: forwards to CBlas<T>::GEMM_PACK,
// prepending CblasRowMajor as the layout argument.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM_PACK(
const CBLAS_IDENTIFIER id,
const CBLAS_TRANSPOSE trans,
int M,
int N,
int K,
const T alpha,
const T *src,
const int ld,
T *dst) const {
CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
......@@ -1044,24 +1024,6 @@ void Blas<phi::CPUContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
}
// platform::CPUDeviceContext specialization: forwards to
// CBlas<T>::GEMM_COMPUTE, prepending CblasRowMajor as the layout argument.
// Note that transA/transB are plain ints here and are passed through as-is.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM_COMPUTE(
int transA,
int transB,
int M,
int N,
int K,
const T *A,
const int lda,
const T *B,
const int ldb,
T beta,
T *C,
const int ldc) const {
CBlas<T>::GEMM_COMPUTE(
CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM_COMPUTE(int transA,
......@@ -1080,11 +1042,6 @@ void Blas<phi::CPUContext>::GEMM_COMPUTE(int transA,
CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc);
}
// platform::CPUDeviceContext specialization: hands `data` to
// CBlas<T>::GEMM_FREE.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM_FREE(T *data) const {
CBlas<T>::GEMM_FREE(data);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM_FREE(T *data) const {
......@@ -1092,36 +1049,6 @@ void Blas<phi::CPUContext>::GEMM_FREE(T *data) const {
}
#endif
// platform::CPUDeviceContext specialization of GEMM for densely packed
// row-major operands: the leading dimensions are derived from the transpose
// flags and the call is forwarded to CBlas<T>::GEMM.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                                    CBLAS_TRANSPOSE transB,
                                                    int M,
                                                    int N,
                                                    int K,
                                                    T alpha,
                                                    const T *A,
                                                    const T *B,
                                                    T beta,
                                                    T *C) const {
  // Leading dimension of A: K when A is used as-is, M otherwise.
  int lead_a = M;
  if (transA == CblasNoTrans) {
    lead_a = K;
  }
  // Leading dimension of B: N when B is used as-is, K otherwise.
  int lead_b = K;
  if (transB == CblasNoTrans) {
    lead_b = N;
  }
  // C always has N columns.
  int lead_c = N;

  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha,
                 A, lead_a,
                 B, lead_b,
                 beta,
                 C, lead_c);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1153,36 +1080,6 @@ void Blas<phi::CPUContext>::GEMM(CBLAS_TRANSPOSE transA,
ldc);
}
// platform::CPUDeviceContext specialization of GEMM taking boolean transpose
// flags and explicit leading dimensions. The flags are mapped onto
// CblasTrans / CblasNoTrans and the call is forwarded to CBlas<T>::GEMM in
// row-major order.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM(bool transA,
                                                    bool transB,
                                                    int M,
                                                    int N,
                                                    int K,
                                                    T alpha,
                                                    const T *A,
                                                    int lda,
                                                    const T *B,
                                                    int ldb,
                                                    T beta,
                                                    T *C,
                                                    int ldc) const {
  CBLAS_TRANSPOSE op_a = CblasNoTrans;
  if (transA) {
    op_a = CblasTrans;
  }
  CBLAS_TRANSPOSE op_b = CblasNoTrans;
  if (transB) {
    op_b = CblasTrans;
  }

  CBlas<T>::GEMM(
      CblasRowMajor, op_a, op_b, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM(bool transA,
......@@ -1214,36 +1111,6 @@ void Blas<phi::CPUContext>::GEMM(bool transA,
ldc);
}
// platform::CPUDeviceContext specialization of GEMM with caller-supplied
// leading dimensions: forwards every argument unchanged to CBlas<T>::GEMM,
// prepending CblasRowMajor as the layout.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
int lda,
const T *B,
int ldb,
T beta,
T *C,
int ldc) const {
CBlas<T>::GEMM(CblasRowMajor,
transA,
transB,
M,
N,
K,
alpha,
A,
lda,
B,
ldb,
beta,
C,
ldc);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMM(CBLAS_TRANSPOSE transA,
......@@ -1323,50 +1190,18 @@ void Blas<DeviceContext>::MatMul(const phi::DenseTensor &mat_a,
mat_out->data<T>());
}
// platform::CPUDeviceContext specialization: forwards to CBlas<T>::AXPY with
// increment 1 for both x and y.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::AXPY(int n,
T alpha,
const T *x,
T *y) const {
CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
}
// phi::CPUContext specialization: forwards to CBlas<T>::AXPY with increment 1
// for both x and y.
template <>
template <typename T>
void Blas<phi::CPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
}
// platform::CPUDeviceContext specialization: forwards to CBlas<T>::VCOPY with
// increment 1 for both x and y.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VCOPY(int n,
const T *x,
T *y) const {
CBlas<T>::VCOPY(n, x, 1, y, 1);
}
// phi::CPUContext specialization: forwards to CBlas<T>::VCOPY with increment 1
// for both x and y.
template <>
template <typename T>
void Blas<phi::CPUContext>::VCOPY(int n, const T *x, T *y) const {
CBlas<T>::VCOPY(n, x, 1, y, 1);
}
// platform::CPUDeviceContext specialization of the element-wise add
// z = x + y over n elements. With MKLML the call goes to CBlas<T>::VADD;
// otherwise it is composed from VCOPY and AXPY.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VADD(int n,
                                                    const T *x,
                                                    const T *y,
                                                    T *z) const {
#ifndef PADDLE_WITH_MKLML
  if (x != z) {
    // General case: z <- y, then z <- z + x.
    this->template VCOPY<T>(n, y, z);
    this->template AXPY<T>(n, (T)(1.), x, z);
  } else {
    // z already holds x, so only y has to be accumulated into it.
    this->template AXPY<T>(n, (T)(1.), y, z);
  }
#else
  CBlas<T>::VADD(n, x, y, z);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VADD(int n, const T *x, const T *y, T *z) const {
......@@ -1382,21 +1217,6 @@ void Blas<phi::CPUContext>::VADD(int n, const T *x, const T *y, T *z) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise subtract
// z = x - y over n elements. Uses CBlas<T>::VSUB when built with MKLML and a
// plain loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VSUB(int n,
                                                    const T *x,
                                                    const T *y,
                                                    T *z) const {
#ifndef PADDLE_WITH_MKLML
  // No vectorised routine is used on this path; subtract element by element.
  for (int idx = 0; idx < n; ++idx) {
    z[idx] = x[idx] - y[idx];
  }
#else
  CBlas<T>::VSUB(n, x, y, z);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VSUB(int n, const T *x, const T *y, T *z) const {
......@@ -1410,21 +1230,6 @@ void Blas<phi::CPUContext>::VSUB(int n, const T *x, const T *y, T *z) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise multiply
// z = x * y over n elements. Uses CBlas<T>::VMUL when built with MKLML and a
// plain loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VMUL(int n,
                                                    const T *x,
                                                    const T *y,
                                                    T *z) const {
#ifndef PADDLE_WITH_MKLML
  // No vectorised routine is used on this path; multiply element by element.
  for (int idx = 0; idx < n; ++idx) {
    z[idx] = x[idx] * y[idx];
  }
#else
  CBlas<T>::VMUL(n, x, y, z);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VMUL(int n, const T *x, const T *y, T *z) const {
......@@ -1438,21 +1243,6 @@ void Blas<phi::CPUContext>::VMUL(int n, const T *x, const T *y, T *z) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise divide
// z = x / y over n elements. Uses CBlas<T>::VDIV when built with MKLML and a
// plain loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VDIV(int n,
                                                    const T *x,
                                                    const T *y,
                                                    T *z) const {
#ifndef PADDLE_WITH_MKLML
  // No vectorised routine is used on this path; divide element by element.
  for (int idx = 0; idx < n; ++idx) {
    z[idx] = x[idx] / y[idx];
  }
#else
  CBlas<T>::VDIV(n, x, y, z);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VDIV(int n, const T *x, const T *y, T *z) const {
......@@ -1466,20 +1256,6 @@ void Blas<phi::CPUContext>::VDIV(int n, const T *x, const T *y, T *z) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise exponential
// y = exp(x) over n elements. Uses CBlas<T>::VEXP when built with MKLML and
// std::exp in a loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VEXP(int n,
                                                    const T *x,
                                                    T *y) const {
#ifndef PADDLE_WITH_MKLML
  // No vectorised routine is used on this path; apply std::exp per element.
  for (int idx = 0; idx < n; ++idx) {
    y[idx] = std::exp(x[idx]);
  }
#else
  CBlas<T>::VEXP(n, x, y);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VEXP(int n, const T *x, T *y) const {
......@@ -1493,19 +1269,6 @@ void Blas<phi::CPUContext>::VEXP(int n, const T *x, T *y) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise square
// y = x * x over n elements. Uses CBlas<T>::VSQUARE when built with MKLML and
// a plain loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VSQUARE(int n,
                                                       const T *x,
                                                       T *y) const {
#ifndef PADDLE_WITH_MKLML
  for (int idx = 0; idx < n; ++idx) {
    y[idx] = x[idx] * x[idx];
  }
#else
  CBlas<T>::VSQUARE(n, x, y);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VSQUARE(int n, const T *x, T *y) const {
......@@ -1518,20 +1281,6 @@ void Blas<phi::CPUContext>::VSQUARE(int n, const T *x, T *y) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise power
// y = x ^ a over n elements with a scalar exponent. Uses CBlas<T>::VPOW when
// built with MKLML and std::pow in a loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VPOW(int n,
                                                    const T *x,
                                                    T a,
                                                    T *y) const {
#ifndef PADDLE_WITH_MKLML
  for (int idx = 0; idx < n; ++idx) {
    y[idx] = std::pow(x[idx], a);
  }
#else
  CBlas<T>::VPOW(n, x, a, y);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VPOW(int n, const T *x, T a, T *y) const {
......@@ -1544,22 +1293,6 @@ void Blas<phi::CPUContext>::VPOW(int n, const T *x, T a, T *y) const {
#endif
}
// platform::CPUDeviceContext specialization of the dot product of two
// n-element vectors. Uses CBlas<T>::DOT (unit increments) when built with
// MKLML and a sequential accumulation otherwise.
template <>
template <typename T>
T Blas<paddle::platform::CPUDeviceContext>::DOT(int n,
                                                const T *x,
                                                const T *y) const {
#ifndef PADDLE_WITH_MKLML
  // Accumulate the products front to back, starting from zero.
  T acc = 0;
  for (int idx = 0; idx < n; ++idx) {
    acc += x[idx] * y[idx];
  }
  return acc;
#else
  return CBlas<T>::DOT(n, x, 1, y, 1);
#endif
}
template <>
template <typename T>
T Blas<phi::CPUContext>::DOT(int n, const T *x, const T *y) const {
......@@ -1575,20 +1308,6 @@ T Blas<phi::CPUContext>::DOT(int n, const T *x, const T *y) const {
#endif
}
// platform::CPUDeviceContext specialization of the in-place scaling
// x = a * x over n elements. Uses CBlas<T>::SCAL (unit increment) when built
// with MKLML and a plain loop otherwise.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::SCAL(int n,
                                                    const T a,
                                                    T *x) const {
#ifndef PADDLE_WITH_MKLML
  for (int idx = 0; idx < n; ++idx) {
    x[idx] = a * x[idx];
  }
#else
  CBlas<T>::SCAL(n, a, x, 1);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::SCAL(int n, const T a, T *x) const {
......@@ -1602,20 +1321,6 @@ void Blas<phi::CPUContext>::SCAL(int n, const T a, T *x) const {
#endif
}
// platform::CPUDeviceContext specialization of ASUM: returns the sum of the
// absolute values of n elements of x taken with stride `inc`.
//
// With MKLML the work is delegated to CBlas<T>::ASUM. The portable fallback
// follows the reference BLAS ?asum contract so both build configurations
// agree: absolute values are summed, elements are read at x[i * inc], and a
// non-positive n or inc yields zero. (The previous fallback summed the raw
// values with unit stride, which differs from the MKLML path for negative
// entries or inc != 1.)
template <>
template <typename T>
T Blas<paddle::platform::CPUDeviceContext>::ASUM(int n, T *x, int inc) const {
  auto sum = static_cast<T>(0.0);
#ifdef PADDLE_WITH_MKLML
  sum = CBlas<T>::ASUM(n, x, inc);
#else
  // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
  if (inc > 0) {
    for (int c = 0; c < n; ++c) {
      // Widen before multiplying so c * inc cannot overflow int.
      const T value = x[static_cast<int64_t>(c) * inc];
      sum += (value < static_cast<T>(0)) ? -value : value;
    }
  }
#endif
  return sum;
}
template <>
template <typename T>
T Blas<phi::CPUContext>::ASUM(int n, T *x, int inc) const {
......@@ -1625,99 +1330,26 @@ T Blas<phi::CPUContext>::ASUM(int n, T *x, int inc) const {
#else
// TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
for (int c = 0; c < n; ++c) {
sum += x[c];
}
#endif
return sum;
}
// platform::CPUDeviceContext specialization of GEMV. Converts the boolean
// flag into a CBLAS_TRANSPOSE value and forwards to CBlas<T>::GEMV in
// row-major order, passing N as the leading dimension of A and increment 1
// for both B and C.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
// phi::CPUContext specialization of GEMV. Converts the boolean flag into a
// CBLAS_TRANSPOSE value and forwards to CBlas<T>::GEMV in row-major order,
// passing N as the leading dimension of A and increment 1 for both B and C.
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T *A,
const T *B,
T beta,
T *C,
int batchCount,
int64_t strideA,
int64_t strideB) const {
PADDLE_ENFORCE_NOT_NULL(
A, phi::errors::InvalidArgument("Pointer A should not be null."));
PADDLE_ENFORCE_NOT_NULL(
B, phi::errors::InvalidArgument("Pointer B should not be null."));
PADDLE_ENFORCE_NOT_NULL(
C, phi::errors::InvalidArgument("Pointer C should not be null."));
#ifdef PADDLE_WITH_MKLML
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
auto a_array = std::vector<const T *>(batchCount);
auto b_array = std::vector<const T *>(batchCount);
auto c_array = std::vector<T *>(batchCount);
for (int k = 0; k < batchCount; ++k) {
a_array[k] = &A[k * strideA];
b_array[k] = &B[k * strideB];
c_array[k] = &C[k * M * N];
}
CBlas<T>::GEMM_BATCH(CblasRowMajor,
&transA,
&transB,
&M,
&N,
&K,
&alpha,
a_array.data(),
&lda,
b_array.data(),
&ldb,
&beta,
c_array.data(),
&ldc,
1 /* group_count */,
&batchCount);
#else
for (int k = 0; k < batchCount; ++k) {
auto *Ak = &A[k * strideA];
auto *Bk = &B[k * strideB];
auto *Ck = &C[k * M * N];
this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
sum += x[c];
}
#endif
return sum;
}
// phi::CPUContext specialization of GEMV. Converts the boolean flag into a
// CBLAS_TRANSPOSE value and forwards to CBlas<T>::GEMV in row-major order,
// passing N as the leading dimension of A and increment 1 for both B and C.
template <>
template <typename T>
void Blas<phi::CPUContext>::GEMV(bool trans_a,
int M,
int N,
T alpha,
const T *A,
const T *B,
T beta,
T *C) const {
CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1778,47 +1410,6 @@ void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
#endif
}
// platform::CPUDeviceContext specialization of BatchedGEMM taking arrays of
// per-batch matrix pointers (A[k], B[k], C[k] for k in [0, batchCount)).
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int M,
int N,
int K,
T alpha,
const T **A,
const T **B,
T beta,
T **C,
int batchCount) const {
#ifdef PADDLE_WITH_MKLML
// Leading dimensions for packed row-major operands, clamped to at least 1.
const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1);
const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1);
const int ldc = (std::max)(N, 1);
// GEMM_BATCH takes its scalar settings by address; a single group that
// contains all batchCount products is submitted.
CBlas<T>::GEMM_BATCH(CblasRowMajor,
&transA,
&transB,
&M,
&N,
&K,
&alpha,
A,
&lda,
B,
&ldb,
&beta,
C,
&ldc,
1 /* group_count */,
&batchCount);
#else
// Without MKLML, run one packed GEMM per batch entry.
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<T>(
transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]);
}
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
......@@ -1864,113 +1455,6 @@ void Blas<phi::CPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
!defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead
// platform::CPUDeviceContext specialization of BatchedGEMMWithHead.
//
// For every one of `head_number` heads, builds per-batch pointer arrays into
// A, B and C and issues one CBlas<T>::GEMM_BATCH call (row-major, one group
// of `batchCount` products).
//
//  * split_b_vertical == true: each head uses a W2 / head_number wide slice
//    of B and writes a slice of the same width into C, whose leading
//    dimension is W2.
//  * split_b_vertical == false: requires W1 == H2; each head contracts over a
//    W1 / head_number wide slice and writes a W2 wide slice into C, whose
//    leading dimension is W2 * head_number.
//
// strideA / strideB are the element distances between consecutive batch
// entries of A and B. head_number must be positive because it is used as a
// divisor.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::BatchedGEMMWithHead(
    CBLAS_TRANSPOSE transA,
    CBLAS_TRANSPOSE transB,
    int W1,
    int H1,
    int W2,
    int H2,
    T alpha,
    const T *A,
    const T *B,
    T beta,
    T *C,
    int batchCount,
    int64_t strideA,
    int64_t strideB,
    int64_t head_number,
    bool split_b_vertical) const {
  // Guard the divisions by head_number below.
  PADDLE_ENFORCE_EQ(
      head_number > 0,
      true,
      phi::errors::InvalidArgument(
          "The head number should be greater than 0, but received %d.",
          head_number));

  int lda = (transA == CblasNoTrans) ? W1 : H1;
  int ldb = (transB == CblasNoTrans) ? W2 : H2;
  auto a_array = std::vector<const T *>(batchCount);
  auto b_array = std::vector<const T *>(batchCount);
  auto c_array = std::vector<T *>(batchCount);

  if (split_b_vertical) {
    int ldc = W2;
    int sub_width = W2 / head_number;

    for (int i = 0; i < head_number; i++) {
      // Element offsets of head i inside a single batch entry.
      int sub_matA_offset = (transA == CblasNoTrans)
                                ? i * (W1 / head_number)
                                : i * (W1 / head_number) * H1;
      int sub_matB_offset = (transB == CblasNoTrans)
                                ? i * (W2 / head_number)
                                : i * (W2 / head_number) * H2;
      int sub_matC_offset = i * W2 / head_number;
      for (int k = 0; k < batchCount; ++k) {
        a_array[k] = &A[k * strideA] + sub_matA_offset;
        b_array[k] = &B[k * strideB] + sub_matB_offset;
        c_array[k] = &C[k * H1 * W2] + sub_matC_offset;
      }

      CBlas<T>::GEMM_BATCH(CblasRowMajor,
                           &transA,
                           &transB,
                           &H1,
                           &sub_width,
                           &H2,
                           &alpha,
                           a_array.data(),
                           &lda,
                           b_array.data(),
                           &ldb,
                           &beta,
                           c_array.data(),
                           &ldc,
                           1 /* group_count */,
                           &batchCount);
    }

  } else {
    PADDLE_ENFORCE_EQ(
        W1,
        H2,
        phi::errors::InvalidArgument(
            "The first matrix width should be same as second matrix height, "
            "but received first matrix width %d"
            ", second matrix height %d",
            W1,
            H2));
    int ldc = W2 * head_number;
    int sub_width = W1 / head_number;

    for (int i = 0; i < head_number; i++) {
      // Element offsets of head i inside a single batch entry.
      int sub_matA_offset = (transA == CblasNoTrans)
                                ? i * (W1 / head_number)
                                : i * (W1 / head_number) * H1;
      int sub_matB_offset = (transB == CblasNoTrans)
                                ? i * (W1 / head_number) * W2
                                : i * (W1 / head_number);
      int sub_matC_offset = i * W2;
      for (int k = 0; k < batchCount; ++k) {
        a_array[k] = &A[k * strideA] + sub_matA_offset;
        b_array[k] = &B[k * strideB] + sub_matB_offset;
        c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset;
      }

      CBlas<T>::GEMM_BATCH(CblasRowMajor,
                           &transA,
                           &transB,
                           &H1,
                           &W2,
                           &sub_width,
                           &alpha,
                           a_array.data(),
                           &lda,
                           b_array.data(),
                           &ldb,
                           &beta,
                           c_array.data(),
                           &ldc,
                           1 /* group_count */,
                           &batchCount);
    }
  }
}
template <>
template <typename T>
void Blas<phi::CPUContext>::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB,
int W1,
......@@ -2097,43 +1581,6 @@ void Blas<DeviceContext>::MatMul(
N);
}
// platform::CPUDeviceContext specialization of MatMul for raw pointers:
// C (M x N) = A (M x K) * B (K x N), with alpha = 1 and beta = 0.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::MatMul(
    const int M, const int N, const int K, const T *A, const T *B, T *C) const {
#ifdef PADDLE_WITH_LIBXSMM
  // See https://github.com/hfp/libxsmm/blob/master/README.md.
  // libxsmm is called unconditionally here: no size threshold is checked
  // before dispatching, since the check itself would be overhead for the
  // small matrices this path is meant for.
  // SMM works on column-major data, so the operands are passed in swapped
  // order (B before A) together with swapped row/column counts (N before M).
  const char transa = 'N';
  const char transb = 'N';
  const T alpha = static_cast<T>(1);
  const T beta = static_cast<T>(0);
  CBlas<T>::SMM_GEMM(
      &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N);
#else
  // Generic path: packed row-major GEMM without transposition.
  CBlas<T>::GEMM(CblasRowMajor,
                 CblasNoTrans,
                 CblasNoTrans,
                 M,
                 N,
                 K,
                 static_cast<T>(1),
                 A,
                 K,
                 B,
                 N,
                 static_cast<T>(0),
                 C,
                 N);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::MatMul(
......@@ -2425,20 +1872,6 @@ void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
#endif
}
// platform::CPUDeviceContext specialization of the element-wise error
// function y = erf(a) over n elements. With MKLML the call goes to
// CBlas<T>::VMERF and `mode` is passed along; the fallback loop uses std::erf
// and does not look at `mode`.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::VMERF(int n,
                                                     const T *a,
                                                     T *y,
                                                     int64_t mode) const {
#ifndef PADDLE_WITH_MKLML
  for (int idx = 0; idx < n; ++idx) {
    y[idx] = std::erf(a[idx]);
  }
#else
  CBlas<T>::VMERF(n, a, y, mode);
#endif
}
template <>
template <typename T>
void Blas<phi::CPUContext>::VMERF(int n, const T *a, T *y, int64_t mode) const {
......@@ -2454,39 +1887,6 @@ void Blas<phi::CPUContext>::VMERF(int n, const T *a, T *y, int64_t mode) const {
#ifdef PADDLE_WITH_MKLML
// platform::CPUDeviceContext specialization of CSRMM: passes every argument,
// unchanged and in the same order, to CBlas<T>::CSRMM.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::CSRMM(const char *transa,
const int *m,
const int *n,
const int *k,
const T *alpha,
const char *matdescra,
const T *val,
const int *indx,
const int *pntrb,
const int *pntre,
const T *b,
const int *ldb,
const T *beta,
T *c,
const int *ldc) const {
CBlas<T>::CSRMM(transa,
m,
n,
k,
alpha,
matdescra,
val,
indx,
pntrb,
pntre,
b,
ldb,
beta,
c,
ldc);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::CSRMM(const char *transa,
const int *m,
const int *n,
......@@ -2520,22 +1920,6 @@ void Blas<phi::CPUContext>::CSRMM(const char *transa,
}
#endif
// platform::CPUDeviceContext specialization of TRSM: forwards to
// CBlas<T>::TRSM, prepending CblasRowMajor as the layout argument.
template <>
template <typename T>
void Blas<paddle::platform::CPUDeviceContext>::TRSM(CBLAS_SIDE side,
CBLAS_UPLO uplo,
CBLAS_TRANSPOSE transA,
CBLAS_DIAG diag,
int M,
int N,
T alpha,
const T *A,
int lda,
T *B,
int ldb) const {
CBlas<T>::TRSM(
CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb);
}
template <>
template <typename T>
void Blas<phi::CPUContext>::TRSM(CBLAS_SIDE side,
......
......@@ -96,8 +96,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
}
}
// Explicit instantiations of FCFunctor for float and double, for both
// paddle::platform::CPUDeviceContext and CPUContext.
template class FCFunctor<paddle::platform::CPUDeviceContext, float>;
template class FCFunctor<paddle::platform::CPUDeviceContext, double>;
template class FCFunctor<CPUContext, float>;
template class FCFunctor<CPUContext, double>;
......
......@@ -41,22 +41,6 @@ struct ForRange<phi::CPUContext> {
size_t limit_;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CPUDeviceContext> {
ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
void operator()(Function func) const {
phi::funcs::ForRange<phi::CPUContext> for_range(dev_ctx_, limit_);
for_range(func);
}
const paddle::platform::CPUDeviceContext& dev_ctx_;
size_t limit_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename Function>
......
......@@ -179,60 +179,6 @@ struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, T> {
}
};
// GRUUnitFunctorV2 specialization for paddle::platform::CPUDeviceContext.
//
// compute() runs the forward pass of one GRU step on the buffers referenced
// by `value`:
//   1. if value.prev_out_value is set, reset_output_value =
//      prev_out_value (batch_size x frame_size) * state_weight^T;
//   2. detail::forward_reset_output applies the gate activation;
//   3. reset_output_value is added row by row onto the third frame_size-wide
//      slice of gate_value (rows of gate_value are 3 * frame_size wide);
//   4. detail::forward_final_output applies the node activation.
//
// The body is compiled only when neither __NVCC__ nor __HIPCC__ is defined.
// (The guard used to test the misspelled `__HIPCC___`, which is never
// defined, so the HIP half of the condition was always true.)
template <typename T>
struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, T> {
  static void compute(const paddle::platform::CPUDeviceContext &context,
                      GRUMetaValue<T> value,
                      int frame_size,
                      int batch_size,
                      const phi::funcs::detail::ActivationType active_node,
                      const phi::funcs::detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC__)
    auto blas =
        phi::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
      // alpha = 1, beta = 0: reset_output_value is overwritten.
      blas.GEMM(CblasNoTrans,
                CblasTrans,
                batch_size,
                frame_size,
                frame_size,
                1,
                value.prev_out_value,
                value.state_weight,
                0,
                value.reset_output_value);
    }
    detail::forward_reset_output(
        phi::funcs::detail::forward::gru_resetOutput<T>(),
        value,
        frame_size,
        batch_size,
        active_gate,
        false,
        &context);

    // Third frame_size-wide slice of each gate_value row.
    T *cell_state_value = value.gate_value + 2 * frame_size;
    T *reset_output_value = value.reset_output_value;
    for (int b = 0; b < batch_size; ++b) {
      // In-place accumulation: cell_state_value += reset_output_value.
      blas.VADD(
          frame_size, cell_state_value, reset_output_value, cell_state_value);
      cell_state_value += frame_size * 3;
      reset_output_value += frame_size;
    }

    detail::forward_final_output(
        phi::funcs::detail::forward::gru_finalOutput<T>(),
        value,
        frame_size,
        batch_size,
        active_node,
        true,
        false,
        &context);
#endif
  }
};
template <typename T>
struct GRUUnitFunctorV2<CPUContext, T> {
static void compute(const CPUContext &context,
......@@ -286,131 +232,6 @@ struct GRUUnitFunctorV2<CPUContext, T> {
}
};
// Backward pass of one GRU step (V2 layout) on the legacy CPU device context.
//
// Parameters:
//   context      - CPU device context; used to obtain the BLAS handle and
//                  passed on to detail::cpu_gru_backward.
//   value        - forward-pass buffers (raw pointers), read here.
//   grad         - gradient buffers (raw pointers), accumulated into in place.
//   frame_size   - per-sample width used for GEMM dimensions and strides.
//   batch_size   - number of samples processed.
//   active_node / active_gate - activations forwarded to cpu_gru_backward.
//
// The whole body is compiled only for host compilers. The guard previously
// tested `__HIPCC___` (three trailing underscores), a macro nobody defines, so
// the HIP half of the condition was always true; it now tests `__HIPCC__`.
template <typename T>
struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext, T> {
  static void compute(const paddle::platform::CPUDeviceContext &context,
                      GRUMetaValue<T> value,
                      GRUMetaGrad<T> grad,
                      int frame_size,
                      int batch_size,
                      const phi::funcs::detail::ActivationType active_node,
                      const phi::funcs::detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC__)
    // calculate grad_update_gate, grad_frame_state,
    // grad_reset_output, grad_reset_gate
    detail::cpu_gru_backward(context,
                             phi::funcs::detail::backward::gru<T>(),
                             value,
                             grad,
                             frame_size,
                             batch_size,
                             active_node,
                             active_gate);
    auto blas =
        phi::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(context);
    if (grad.prev_out_grad && value.prev_out_value) {
      // update prev_out_grad
      // Three accumulating GEMMs (beta == 1) into prev_out_grad: two read
      // frame_size-wide slices of gate_grad (row stride 3 * frame_size), the
      // third reads reset_output_grad (row stride frame_size).
      blas.GEMM(false,
                false,
                batch_size,
                frame_size,
                frame_size,
                1,
                grad.gate_grad,
                frame_size * 3,
                value.gate_weight,
                frame_size,
                1,
                grad.prev_out_grad,
                frame_size);
      blas.GEMM(false,
                false,
                batch_size,
                frame_size,
                frame_size,
                1,
                grad.gate_grad + frame_size,
                frame_size * 3,
                value.gate_weight + frame_size * frame_size,
                frame_size,
                1,
                grad.prev_out_grad,
                frame_size);
      blas.GEMM(false,
                false,
                batch_size,
                frame_size,
                frame_size,
                1,
                grad.reset_output_grad,
                frame_size,
                value.state_weight,
                frame_size,
                1,
                grad.prev_out_grad,
                frame_size);
      // update weight_hh_grad
      if (grad.gate_weight_grad) {
        // reset gate
        blas.GEMM(true,
                  false,
                  frame_size,
                  frame_size,
                  batch_size,
                  1,
                  grad.gate_grad,
                  frame_size * 3,
                  value.prev_out_value,
                  frame_size,
                  1,
                  grad.gate_weight_grad,
                  frame_size);
        // update gate
        blas.GEMM(true,
                  false,
                  frame_size,
                  frame_size,
                  batch_size,
                  1,
                  grad.gate_grad + frame_size,
                  frame_size * 3,
                  value.prev_out_value,
                  frame_size,
                  1,
                  grad.gate_weight_grad + frame_size * frame_size,
                  frame_size);
        // cell state
        blas.GEMM(true,
                  false,
                  frame_size,
                  frame_size,
                  batch_size,
                  1,
                  grad.reset_output_grad,
                  frame_size,
                  value.prev_out_value,
                  frame_size,
                  1,
                  grad.state_weight_grad,
                  frame_size);
      }
    }
    // update bias_hh_grad
    // The destination pointers are not advanced: every batch row is summed
    // into the same bias_hh_grad buffer. The first 2 * frame_size entries take
    // gate_grad; the last frame_size entries take reset_output_grad.
    T *gate_grad = grad.gate_grad;
    T *bias_hh_grad = grad.bias_hh_grad;
    T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size;
    T *reset_output_grad = grad.reset_output_grad;
    for (int b = 0; b < batch_size; ++b) {
      blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad);
      blas.VADD(
          frame_size, state_bias_grad, reset_output_grad, state_bias_grad);
      gate_grad += 3 * frame_size;
      reset_output_grad += frame_size;
    }
#endif
  }
};
template <typename T>
struct GRUUnitGradFunctorV2<CPUContext, T> {
static void compute(const CPUContext &context,
......@@ -540,12 +361,6 @@ template struct GRUUnitFunctor<paddle::platform::CPUDeviceContext, double>;
// Explicit instantiations of the GRU unit functors for float and double.
// The V2 functors are instantiated for both the legacy
// paddle::platform::CPUDeviceContext and CPUContext.
template struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, double>;
template struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, double>;
template struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext,
double>;
template struct GRUUnitFunctorV2<CPUContext, float>;
template struct GRUUnitFunctorV2<CPUContext, double>;
template struct GRUUnitGradFunctorV2<CPUContext, float>;
......
......@@ -21,38 +21,6 @@ limitations under the License. */
namespace phi {
namespace funcs {
// LSTM forward step for the legacy paddle::platform::CPUDeviceContext.
//
// Runs detail::cpu_lstm_forward once per sample. `value` is taken by value, so
// the pointer bumps below move only this function's local copy from one
// sample to the next; the caller's struct is not modified.
template <class T>
struct LstmUnitFunctor<paddle::platform::CPUDeviceContext, T> {
  static void compute(const paddle::platform::CPUDeviceContext& context,
                      LstmMetaValue<T> value,
                      int frame_size,
                      int batch_size,
                      T cell_clip,
                      const phi::funcs::detail::ActivationType& gate_act,
                      const phi::funcs::detail::ActivationType& cell_act,
                      const phi::funcs::detail::ActivationType& cand_act,
                      bool old_api_version = true) {
    for (int sample = 0; sample < batch_size; ++sample) {
      // Note the activation order expected by the helper: cand, gate, cell.
      detail::cpu_lstm_forward(context,
                               phi::funcs::detail::forward::lstm<T>(),
                               value,
                               frame_size,
                               cell_clip,
                               cand_act,
                               gate_act,
                               cell_act,
                               old_api_version);

      // Step to the next sample: gate rows are 4 * frame_size wide, every
      // other buffer is frame_size wide.
      if (value.prev_state_value != nullptr) {
        value.prev_state_value += frame_size;
      }
      value.output_value += frame_size;
      value.state_active_value += frame_size;
      value.state_value += frame_size;
      value.gate_value += frame_size * 4;
    }
  }
};
template <class T>
struct LstmUnitFunctor<CPUContext, T> {
static void compute(const CPUContext& context,
......@@ -85,49 +53,6 @@ struct LstmUnitFunctor<CPUContext, T> {
}
};
// LSTM backward step for the legacy paddle::platform::CPUDeviceContext.
//
// Runs detail::cpu_lstm_backward once per sample. `value` and `grad` are taken
// by value, so the pointer bumps below move only the local copies from one
// sample to the next.
template <class T>
struct LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, T> {
  static void compute(const paddle::platform::CPUDeviceContext& context,
                      LstmMetaValue<T> value,
                      LstmMetaGrad<T> grad,
                      int frame_size,
                      int batch_size,
                      T cell_clip,
                      const phi::funcs::detail::ActivationType& gate_act,
                      const phi::funcs::detail::ActivationType& cell_act,
                      const phi::funcs::detail::ActivationType& cand_act,
                      bool old_api_version = true) {
    for (int sample = 0; sample < batch_size; ++sample) {
      // Note the activation order expected by the helper: cand, gate, cell.
      detail::cpu_lstm_backward(context,
                                phi::funcs::detail::backward::lstm<T>(),
                                value,
                                grad,
                                frame_size,
                                cell_clip,
                                cand_act,
                                gate_act,
                                cell_act,
                                old_api_version);

      // Advance the forward-value cursors: gate rows are 4 * frame_size wide,
      // the remaining buffers are frame_size wide.
      if (value.prev_state_value != nullptr) {
        value.prev_state_value += frame_size;
      }
      value.output_value += frame_size;
      value.state_active_value += frame_size;
      value.state_value += frame_size;
      value.gate_value += frame_size * 4;

      // Advance the gradient cursors with the same strides.
      if (grad.prev_state_grad != nullptr) {
        grad.prev_state_grad += frame_size;
      }
      grad.output_grad += frame_size;
      grad.state_active_grad += frame_size;
      grad.state_grad += frame_size;
      grad.gate_grad += frame_size * 4;
    }
  }
};
template <class T>
struct LstmUnitGradFunctor<CPUContext, T> {
static void compute(const CPUContext& context,
......@@ -171,11 +96,6 @@ struct LstmUnitGradFunctor<CPUContext, T> {
}
};
// Explicit instantiations of the LSTM unit functors for float and double, for
// the legacy paddle::platform::CPUDeviceContext and for CPUContext.
template class LstmUnitFunctor<paddle::platform::CPUDeviceContext, float>;
template class LstmUnitFunctor<paddle::platform::CPUDeviceContext, double>;
template class LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, float>;
template class LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, double>;
template class LstmUnitFunctor<CPUContext, float>;
template class LstmUnitFunctor<CPUContext, double>;
template class LstmUnitGradFunctor<CPUContext, float>;
......
......@@ -39,22 +39,6 @@ namespace funcs {
// Short alias for the half-precision type.
using float16 = phi::dtype::float16;
// Explicit instantiations of SetConstant on the legacy
// paddle::platform::CPUDeviceContext, one per supported element type.
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CPUDeviceContext, float>;
template struct SetConstant<paddle::platform::CPUDeviceContext, double>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext, bool>;
template struct SetConstant<paddle::platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::complex<double>>;
// The same instantiations for phi::CPUContext.
template struct SetConstant<phi::CPUContext, phi::dtype::float16>;
template struct SetConstant<phi::CPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::CPUContext, float>;
......@@ -86,32 +70,6 @@ template struct SetConstant<paddle::platform::XPUDeviceContext,
#endif
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::float16, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
int16_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
uint8_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::float16, RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::bfloat16, RANK>; \
template struct Transpose<phi::CPUContext, float, RANK>; \
......@@ -164,7 +122,6 @@ void TransposeNormal<DeviceContext, T>::operator()(
// define transpose normal
// DEFINE_CPU_TRANS_NORMAL(TYPE) explicitly instantiates TransposeNormal for
// TYPE on both paddle::platform::CPUDeviceContext and phi::CPUContext. The
// macro body ends without a semicolon, so each use site supplies it.
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CPUDeviceContext, TYPE>; \
template struct TransposeNormal<phi::CPUContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(phi::dtype::float16);
......@@ -291,6 +248,31 @@ void set_constant(const paddle::platform::DeviceContext& context,
#endif
}
// Explicit instantiations on phi::CPUContext: ColwiseSum for float, double,
// int and int64_t; RowwiseMean for float and double.
template struct ColwiseSum<phi::CPUContext, float>;
template struct ColwiseSum<phi::CPUContext, double>;
template struct ColwiseSum<phi::CPUContext, int>;
template struct ColwiseSum<phi::CPUContext, int64_t>;
template struct RowwiseMean<phi::CPUContext, float>;
template struct RowwiseMean<phi::CPUContext, double>;
template <typename T>
struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
void operator()(paddle::platform::CPUDeviceContext* ctx,
const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst) {
auto in = paddle::framework::EigenVector<T>::Flatten(src);
auto out = paddle::framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
template <typename T>
struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
void operator()(const paddle::platform::CPUDeviceContext& context,
......@@ -333,41 +315,5 @@ struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
// Explicit instantiations of the row-wise / column-wise helpers.
// RowwiseAdd and RowwiseSum are instantiated only for the legacy
// paddle::platform::CPUDeviceContext; ColwiseSum and RowwiseMean are
// instantiated for both it and phi::CPUContext.
template struct RowwiseAdd<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseAdd<paddle::platform::CPUDeviceContext, double>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, float>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int64_t>;
template struct ColwiseSum<phi::CPUContext, float>;
template struct ColwiseSum<phi::CPUContext, double>;
template struct ColwiseSum<phi::CPUContext, int>;
template struct ColwiseSum<phi::CPUContext, int64_t>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<phi::CPUContext, float>;
template struct RowwiseMean<phi::CPUContext, double>;
template <typename T>
struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
void operator()(paddle::platform::CPUDeviceContext* ctx,
const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst) {
auto in = paddle::framework::EigenVector<T>::Flatten(src);
auto out = paddle::framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
} // namespace funcs
} // namespace phi
......@@ -29,9 +29,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
// Explicit instantiations of MatrixInverseFunctor for float and double on
// CPUContext.
template class MatrixInverseFunctor<CPUContext, float>;
template class MatrixInverseFunctor<CPUContext, double>;
// TODO(chenweihang): remove these instantiations later
// Same element types, instantiated for the legacy
// paddle::platform::CPUDeviceContext.
template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, double>;
} // namespace funcs
} // namespace phi
......@@ -48,7 +48,6 @@ TEST(API, to_sparse_coo) {
std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test dense_to_sparse_coo
paddle::experimental::Tensor x(dense_x);
......
......@@ -47,7 +47,6 @@ TEST(Scalar, ConstructFromDenseTensor1) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
dense_x_data[0] = 1;
......@@ -67,7 +66,6 @@ TEST(Scalar, ConstructFromDenseTensor2) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
dense_x_data[0] = 1;
......@@ -87,7 +85,6 @@ TEST(Scalar, ConstructFromDenseTensor3) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
dense_x_data[0] = 1;
......@@ -107,7 +104,6 @@ TEST(Scalar, ConstructFromDenseTensor4) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
dense_x_data[0] = true;
......@@ -127,7 +123,6 @@ TEST(Scalar, ConstructFromDenseTensor5) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
dense_x_data[0] = 1;
......@@ -148,7 +143,6 @@ TEST(Scalar, ConstructFromDenseTensor6) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
dense_x_data[0] = 1;
......@@ -170,7 +164,6 @@ TEST(Scalar, ConstructFromDenseTensor7) {
.GetAllocator(phi::GPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
dev_ctx.Wait();
......
......@@ -24,10 +24,6 @@ cc_test(
test_op_utils
SRCS test_op_utils.cc
DEPS op_compat_infos)
# Builds the test_phi_device_context test from test_device_context.cc and
# links it against the phi_context and cpu_context targets.
cc_test(
test_phi_device_context
SRCS test_device_context.cc
DEPS phi_context cpu_context)
cc_test(
test_meta_fn_utils
SRCS test_meta_fn_utils.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
// TODO(wilber): remove this include after the CPU and GPU contexts are merged.
#include "paddle/phi/backends/cpu/cpu_context.h"
// #include "paddle/phi/backends/all_context.h"
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "unsupported/Eigen/CXX11/Tensor"
namespace phi {
namespace tests {
// Test-only subclass of CPUContext that offers a public SetEigenDevice, so a
// test can inject an externally created Eigen device.
// NOTE(review): presumably CPUContext::SetEigenDevice is not publicly
// accessible, which is why this forwarding wrapper exists; confirm against
// cpu_context.h.
class InferenceCPUContext : public CPUContext {
public:
// Forwards `eigen_device` unchanged to CPUContext::SetEigenDevice.
void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
CPUContext::SetEigenDevice(eigen_device);
}
};
// Checks that a CPUContext exposes a non-null Eigen device in two setups:
//   1. "training": the context sets itself up through Init();
//   2. "inference": an Eigen device created by the caller is injected through
//      InferenceCPUContext::SetEigenDevice.
TEST(DeviceContext, cpu_context) {
  std::cout << "test training scenarios" << std::endl;
  {
    phi::CPUContext ctx;
    ctx.Init();
    EXPECT_TRUE(ctx.eigen_device() != nullptr);
  }

  std::cout << "test inference scenarios" << std::endl;
  // The device lives on the stack instead of behind a raw new/delete pair, so
  // it is released on every exit path. It is declared before the context and
  // therefore outlives it, matching the original delete-after-scope ordering.
  Eigen::DefaultDevice device;
  {
    InferenceCPUContext ctx;
    ctx.SetEigenDevice(&device);
    EXPECT_TRUE(ctx.eigen_device() != nullptr);
  }
}
} // namespace tests
} // namespace phi
......@@ -52,7 +52,6 @@ TEST(DEV_API, cast) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::DataType out_dtype = phi::DataType::FLOAT64;
// 2. test API
......
......@@ -60,7 +60,6 @@ TEST(DEV_API, concat) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Concat<float>(dev_ctx, inputs, 0);
// 3. check result
......
......@@ -48,7 +48,6 @@ TEST(DEV_API, conj) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Conj<paddle::complex64>(dev_ctx, dense_x);
......
......@@ -65,7 +65,6 @@ TEST(DEV_API, copy) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::Copy(
dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
......
......@@ -36,7 +36,6 @@ TEST(DEV_API, empty) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Empty<int>(dev_ctx, {3, 2});
......@@ -66,7 +65,6 @@ TEST(DEV_API, empty_like) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::EmptyLike<float>(dev_ctx, dense_x);
// 3. check result
......@@ -86,7 +84,6 @@ TEST(DEV_API, full) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Full<float>(dev_ctx, {3, 2}, val);
// 3. check result
......@@ -119,7 +116,6 @@ TEST(DEV_API, full_like) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::FullLike<float>(dev_ctx, dense_x, val);
......
......@@ -61,7 +61,6 @@ TEST(DEV_API, dot) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Dot<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -66,7 +66,6 @@ TEST(DEV_API, add) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Add<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -118,7 +117,6 @@ TEST(DEV_API, subtract) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Subtract<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -170,7 +168,6 @@ TEST(DEV_API, divide) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Divide<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -222,7 +219,6 @@ TEST(DEV_API, multiply) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Multiply<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -52,7 +52,6 @@ TEST(DEV_API, flatten) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
......
......@@ -273,7 +273,6 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
phi::funcs::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10,
......
......@@ -58,7 +58,6 @@ TEST(DEV_API, dot) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
// 3. check result
......
......@@ -51,7 +51,6 @@ TEST(DEV_API, mean) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Mean<float>(dev_ctx, dense_x, dims, false);
// 3. check result
......
......@@ -54,7 +54,6 @@ TEST(DEV_API, reshape) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result
std::vector<int64_t> expect_shape = {12, 3};
......
......@@ -51,7 +51,6 @@ TEST(DEV_API, scale) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......@@ -93,7 +92,6 @@ TEST(DEV_API, scale_host) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......
......@@ -42,7 +42,6 @@ TEST(DEV_API, sparse_relu) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
DenseTensor dense_x =
phi::Empty(dev_ctx_cpu,
......
......@@ -75,7 +75,6 @@ void TestConv3dBase(const std::vector<IntT>& indices,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
......
......@@ -113,7 +113,6 @@ TEST(DEV_API, sparse_elementwise_coo_kernel_double) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto coo_x = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
auto coo_y = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_y, sparse_dim);
......@@ -159,7 +158,6 @@ TEST(DEV_API, sparse_elementwise_csr_kernel_float) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_x);
auto csr_y = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_y);
......@@ -357,7 +355,6 @@ TEST(DEV_API, sparse_elementwise_csr_grad_kernel_float) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_x);
auto csr_y = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_y);
......@@ -404,7 +401,6 @@ TEST(DEV_API, sparse_elementwise_coo_grad_kernel_double) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
auto csr_y = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_y, sparse_dim);
......
......@@ -60,7 +60,6 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_cpu.Init();
const int in_channels = x_dims[4];
const int out_channels = in_channels;
......
......@@ -88,7 +88,6 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
paddle::platform::CPUPlace());
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -307,7 +306,6 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -489,7 +487,6 @@ void TestCooToCsr(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -588,7 +585,6 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -701,7 +697,6 @@ void TestSparseCooToDense(const DDim& dense_dims,
const int64_t non_zero_num,
const int64_t sparse_dim) {
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -879,7 +874,6 @@ void TestSparseCsrToDense(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......
......@@ -40,7 +40,6 @@ TEST(DEV_API, split) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
for (size_t i = 0; i < 4; ++i) {
......
......@@ -49,7 +49,6 @@ TEST(DEV_API, sum) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out =
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册